# test_parse_html_urls.py
  1. #!/usr/bin/env python3
  2. """Unit tests for parse_html_urls extractor."""
  3. import json
  4. import subprocess
  5. import sys
  6. from pathlib import Path
  7. import pytest
  8. PLUGIN_DIR = Path(__file__).parent.parent
  9. SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.*'), None)
  10. class TestParseHtmlUrls:
  11. """Test the parse_html_urls extractor CLI."""
  12. def test_parses_real_example_com(self, tmp_path):
  13. """Test parsing real https://example.com and extracting its links."""
  14. result = subprocess.run(
  15. [sys.executable, str(SCRIPT_PATH), '--url', 'https://example.com'],
  16. cwd=tmp_path,
  17. capture_output=True,
  18. text=True,
  19. timeout=30
  20. )
  21. assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
  22. # Verify stdout contains JSONL records for discovered URLs
  23. # example.com links to iana.org
  24. assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"
  25. # Verify ArchiveResult record is present
  26. assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
  27. assert '"status": "succeeded"' in result.stdout, "Missing success status"
  28. def test_extracts_href_urls(self, tmp_path):
  29. """Test extracting URLs from anchor tags."""
  30. input_file = tmp_path / 'page.html'
  31. input_file.write_text('''
  32. <!DOCTYPE html>
  33. <html>
  34. <body>
  35. <a href="https://example.com">Example</a>
  36. <a href="https://foo.bar/page">Foo</a>
  37. <a href="http://test.org">Test</a>
  38. </body>
  39. </html>
  40. ''')
  41. result = subprocess.run(
  42. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  43. cwd=tmp_path,
  44. capture_output=True,
  45. text=True,
  46. )
  47. assert result.returncode == 0
  48. assert 'urls.jsonl' in result.stderr
  49. # Parse Snapshot records from stdout
  50. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
  51. assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"
  52. urls = set()
  53. for line in lines:
  54. entry = json.loads(line)
  55. assert entry['type'] == 'Snapshot'
  56. assert 'url' in entry
  57. urls.add(entry['url'])
  58. assert 'https://example.com' in urls
  59. assert 'https://foo.bar/page' in urls
  60. assert 'http://test.org' in urls
  61. # Verify ArchiveResult record
  62. assert '"type": "ArchiveResult"' in result.stdout
  63. assert '"status": "succeeded"' in result.stdout
  64. urls_file = tmp_path / 'urls.jsonl'
  65. assert urls_file.exists(), "urls.jsonl not created"
  66. file_lines = [line for line in urls_file.read_text().splitlines() if line.strip()]
  67. assert len(file_lines) == 3, f"Expected 3 urls.jsonl entries, got {len(file_lines)}"
  68. def test_ignores_non_http_schemes(self, tmp_path):
  69. """Test that non-http schemes are ignored."""
  70. input_file = tmp_path / 'page.html'
  71. input_file.write_text('''
  72. <html>
  73. <body>
  74. <a href="mailto:[email protected]">Email</a>
  75. <a href="javascript:void(0)">JS</a>
  76. <a href="tel:+1234567890">Phone</a>
  77. <a href="https://valid.com">Valid</a>
  78. </body>
  79. </html>
  80. ''')
  81. result = subprocess.run(
  82. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  83. cwd=tmp_path,
  84. capture_output=True,
  85. text=True,
  86. )
  87. assert result.returncode == 0
  88. # Parse Snapshot records from stdout
  89. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
  90. assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"
  91. entry = json.loads(lines[0])
  92. assert entry['url'] == 'https://valid.com'
  93. def test_handles_html_entities(self, tmp_path):
  94. """Test that HTML entities in URLs are decoded."""
  95. input_file = tmp_path / 'page.html'
  96. input_file.write_text('''
  97. <html>
  98. <body>
  99. <a href="https://example.com/page?a=1&amp;b=2">Link</a>
  100. </body>
  101. </html>
  102. ''')
  103. result = subprocess.run(
  104. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  105. cwd=tmp_path,
  106. capture_output=True,
  107. text=True,
  108. )
  109. assert result.returncode == 0
  110. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  111. entry = json.loads(lines[0])
  112. assert entry['url'] == 'https://example.com/page?a=1&b=2'
  113. def test_deduplicates_urls(self, tmp_path):
  114. """Test that duplicate URLs are deduplicated."""
  115. input_file = tmp_path / 'page.html'
  116. input_file.write_text('''
  117. <html>
  118. <body>
  119. <a href="https://example.com">Link 1</a>
  120. <a href="https://example.com">Link 2</a>
  121. <a href="https://example.com">Link 3</a>
  122. </body>
  123. </html>
  124. ''')
  125. result = subprocess.run(
  126. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  127. cwd=tmp_path,
  128. capture_output=True,
  129. text=True,
  130. )
  131. assert result.returncode == 0
  132. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  133. assert len(lines) == 1
  134. def test_excludes_source_url(self, tmp_path):
  135. """Test that the source URL itself is excluded from results."""
  136. input_file = tmp_path / 'page.html'
  137. source_url = f'file://{input_file}'
  138. input_file.write_text(f'''
  139. <html>
  140. <body>
  141. <a href="{source_url}">Self</a>
  142. <a href="https://other.com">Other</a>
  143. </body>
  144. </html>
  145. ''')
  146. result = subprocess.run(
  147. [sys.executable, str(SCRIPT_PATH), '--url', source_url],
  148. cwd=tmp_path,
  149. capture_output=True,
  150. text=True,
  151. )
  152. assert result.returncode == 0
  153. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  154. assert len(lines) == 1
  155. entry = json.loads(lines[0])
  156. assert entry['url'] == 'https://other.com'
  157. def test_skips_when_no_urls_found(self, tmp_path):
  158. """Test that script returns skipped status when no URLs found."""
  159. input_file = tmp_path / 'page.html'
  160. input_file.write_text('<html><body>No links here</body></html>')
  161. result = subprocess.run(
  162. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  163. cwd=tmp_path,
  164. capture_output=True,
  165. text=True,
  166. )
  167. assert result.returncode == 0
  168. assert 'urls.jsonl' in result.stderr
  169. assert '"status": "skipped"' in result.stdout
  170. def test_handles_malformed_html(self, tmp_path):
  171. """Test handling of malformed HTML."""
  172. input_file = tmp_path / 'malformed.html'
  173. input_file.write_text('''
  174. <html>
  175. <body>
  176. <a href="https://example.com">Unclosed tag
  177. <a href="https://other.com">Another link</a>
  178. </body>
  179. ''')
  180. result = subprocess.run(
  181. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  182. cwd=tmp_path,
  183. capture_output=True,
  184. text=True,
  185. )
  186. assert result.returncode == 0
  187. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  188. assert len(lines) == 2
  189. def test_output_is_valid_json(self, tmp_path):
  190. """Test that output contains required fields."""
  191. input_file = tmp_path / 'page.html'
  192. input_file.write_text('<a href="https://example.com">Link</a>')
  193. result = subprocess.run(
  194. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  195. cwd=tmp_path,
  196. capture_output=True,
  197. text=True,
  198. )
  199. assert result.returncode == 0
  200. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  201. entry = json.loads(lines[0])
  202. assert entry['url'] == 'https://example.com'
  203. assert entry['type'] == 'Snapshot'
  204. assert entry['plugin'] == 'parse_html_urls'
  205. if __name__ == '__main__':
  206. pytest.main([__file__, '-v'])