# test_parse_txt_urls.py
  1. #!/usr/bin/env python3
  2. """Unit tests for parse_txt_urls extractor."""
  3. import json
  4. import subprocess
  5. import sys
  6. from pathlib import Path
  7. import pytest
  8. PLUGIN_DIR = Path(__file__).parent.parent
  9. SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.*'), None)
  10. class TestParseTxtUrls:
  11. """Test the parse_txt_urls extractor CLI."""
  12. def test_extracts_urls_including_real_example_com(self, tmp_path):
  13. """Test extracting URLs from plain text including real example.com."""
  14. input_file = tmp_path / 'urls.txt'
  15. input_file.write_text('''
  16. https://example.com
  17. https://example.com/page
  18. https://www.iana.org/domains/reserved
  19. ''')
  20. result = subprocess.run(
  21. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  22. cwd=tmp_path,
  23. capture_output=True,
  24. text=True,
  25. )
  26. assert result.returncode == 0, f"Failed: {result.stderr}"
  27. assert 'urls.jsonl' in result.stderr
  28. # Parse Snapshot records from stdout
  29. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
  30. assert len(lines) == 3
  31. urls = set()
  32. for line in lines:
  33. entry = json.loads(line)
  34. assert entry['type'] == 'Snapshot'
  35. assert 'url' in entry
  36. urls.add(entry['url'])
  37. # Verify real URLs are extracted correctly
  38. assert 'https://example.com' in urls
  39. assert 'https://example.com/page' in urls
  40. assert 'https://www.iana.org/domains/reserved' in urls
  41. # Verify ArchiveResult record
  42. assert '"type": "ArchiveResult"' in result.stdout
  43. assert '"status": "succeeded"' in result.stdout
  44. def test_extracts_urls_from_mixed_content(self, tmp_path):
  45. """Test extracting URLs embedded in prose text."""
  46. input_file = tmp_path / 'mixed.txt'
  47. input_file.write_text('''
  48. Check out this great article at https://blog.example.com/post
  49. You can also visit http://docs.test.org for more info.
  50. Also see https://github.com/user/repo for the code.
  51. ''')
  52. result = subprocess.run(
  53. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  54. cwd=tmp_path,
  55. capture_output=True,
  56. text=True,
  57. )
  58. assert result.returncode == 0
  59. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  60. urls = {json.loads(line)['url'] for line in lines}
  61. assert 'https://blog.example.com/post' in urls
  62. assert 'http://docs.test.org' in urls
  63. assert 'https://github.com/user/repo' in urls
  64. def test_handles_markdown_urls(self, tmp_path):
  65. """Test handling URLs in markdown format with parentheses."""
  66. input_file = tmp_path / 'markdown.txt'
  67. input_file.write_text('''
  68. [Example](https://example.com/page)
  69. [Wiki](https://en.wikipedia.org/wiki/Article_(Disambiguation))
  70. ''')
  71. result = subprocess.run(
  72. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  73. cwd=tmp_path,
  74. capture_output=True,
  75. text=True,
  76. )
  77. assert result.returncode == 0
  78. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  79. urls = {json.loads(line)['url'] for line in lines}
  80. assert 'https://example.com/page' in urls
  81. assert any('wikipedia.org' in u for u in urls)
  82. def test_skips_when_no_urls_found(self, tmp_path):
  83. """Test that script returns skipped status when no URLs found."""
  84. input_file = tmp_path / 'empty.txt'
  85. input_file.write_text('no urls here, just plain text')
  86. result = subprocess.run(
  87. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  88. cwd=tmp_path,
  89. capture_output=True,
  90. text=True,
  91. )
  92. assert result.returncode == 0
  93. assert 'urls.jsonl' in result.stderr
  94. assert '"status": "skipped"' in result.stdout
  95. def test_exits_1_when_file_not_found(self, tmp_path):
  96. """Test that script exits with code 1 when file doesn't exist."""
  97. result = subprocess.run(
  98. [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/path.txt'],
  99. cwd=tmp_path,
  100. capture_output=True,
  101. text=True,
  102. )
  103. assert result.returncode == 1
  104. assert 'Failed to fetch' in result.stderr
  105. def test_deduplicates_urls(self, tmp_path):
  106. """Test that duplicate URLs are deduplicated."""
  107. input_file = tmp_path / 'dupes.txt'
  108. input_file.write_text('''
  109. https://example.com
  110. https://example.com
  111. https://example.com
  112. https://other.com
  113. ''')
  114. result = subprocess.run(
  115. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  116. cwd=tmp_path,
  117. capture_output=True,
  118. text=True,
  119. )
  120. assert result.returncode == 0
  121. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  122. assert len(lines) == 2
  123. def test_outputs_to_stdout(self, tmp_path):
  124. """Test that output goes to stdout in JSONL format."""
  125. input_file = tmp_path / 'urls.txt'
  126. input_file.write_text('https://new.com\nhttps://other.com')
  127. result = subprocess.run(
  128. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  129. cwd=tmp_path,
  130. capture_output=True,
  131. text=True,
  132. )
  133. assert result.returncode == 0
  134. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  135. assert len(lines) == 2
  136. urls = {json.loads(line)['url'] for line in lines}
  137. assert 'https://new.com' in urls
  138. assert 'https://other.com' in urls
  139. def test_output_is_valid_json(self, tmp_path):
  140. """Test that output contains required fields."""
  141. input_file = tmp_path / 'urls.txt'
  142. input_file.write_text('https://example.com')
  143. result = subprocess.run(
  144. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  145. cwd=tmp_path,
  146. capture_output=True,
  147. text=True,
  148. )
  149. assert result.returncode == 0
  150. lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
  151. entry = json.loads(lines[0])
  152. assert entry['url'] == 'https://example.com'
  153. assert entry['type'] == 'Snapshot'
  154. assert entry['plugin'] == 'parse_txt_urls'
  155. if __name__ == '__main__':
  156. pytest.main([__file__, '-v'])