#!/usr/bin/env python3
"""Unit tests for the parse_rss_urls extractor CLI."""
import json
import subprocess
import sys
from pathlib import Path

import pytest

# This test file lives in <plugin>/tests/, so the plugin root is two levels up.
PLUGIN_DIR = Path(__file__).parent.parent
# The extractor script under test, discovered by its hook-style filename
# (on_Snapshot__<order>_parse_rss_urls.<ext>); None if it is not present.
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.*'), None)
  10. class TestParseRssUrls:
  11. """Test the parse_rss_urls extractor CLI."""
  12. def test_parses_real_rss_feed(self, tmp_path):
  13. """Test parsing a real RSS feed from the web."""
  14. # Use httpbin.org which provides a sample RSS feed
  15. result = subprocess.run(
  16. [sys.executable, str(SCRIPT_PATH), '--url', 'https://news.ycombinator.com/rss'],
  17. cwd=tmp_path,
  18. capture_output=True,
  19. text=True,
  20. timeout=30
  21. )
  22. # HN RSS feed should parse successfully
  23. if result.returncode == 0:
  24. # Output goes to stdout (JSONL)
  25. content = result.stdout
  26. assert len(content) > 0, "No URLs extracted from real RSS feed"
  27. # Verify at least one URL was extracted
  28. lines = content.strip().split('\n')
  29. assert len(lines) > 0, "No entries found in RSS feed"
  30. def test_extracts_urls_from_rss_feed(self, tmp_path):
  31. """Test extracting URLs from an RSS 2.0 feed."""
  32. input_file = tmp_path / 'feed.rss'
  33. input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
  34. <rss version="2.0">
  35. <channel>
  36. <title>Test Feed</title>
  37. <link>https://example.com</link>
  38. <item>
  39. <title>First Post</title>
  40. <link>https://example.com/post/1</link>
  41. <pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
  42. </item>
  43. <item>
  44. <title>Second Post</title>
  45. <link>https://example.com/post/2</link>
  46. <pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
  47. </item>
  48. </channel>
  49. </rss>
  50. ''')
  51. result = subprocess.run(
  52. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  53. cwd=tmp_path,
  54. capture_output=True,
  55. text=True,
  56. )
  57. assert result.returncode == 0
  58. assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
  59. # Output goes to stdout (JSONL)
  60. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  61. assert len(lines) == 2
  62. entries = [json.loads(line) for line in lines]
  63. urls = {e['url'] for e in entries}
  64. titles = {e.get('title') for e in entries}
  65. assert 'https://example.com/post/1' in urls
  66. assert 'https://example.com/post/2' in urls
  67. assert 'First Post' in titles
  68. assert 'Second Post' in titles
  69. def test_extracts_urls_from_atom_feed(self, tmp_path):
  70. """Test extracting URLs from an Atom feed."""
  71. input_file = tmp_path / 'feed.atom'
  72. input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
  73. <feed xmlns="http://www.w3.org/2005/Atom">
  74. <title>Test Atom Feed</title>
  75. <entry>
  76. <title>Atom Post 1</title>
  77. <link href="https://atom.example.com/entry/1"/>
  78. <updated>2024-01-01T12:00:00Z</updated>
  79. </entry>
  80. <entry>
  81. <title>Atom Post 2</title>
  82. <link href="https://atom.example.com/entry/2"/>
  83. <updated>2024-01-02T12:00:00Z</updated>
  84. </entry>
  85. </feed>
  86. ''')
  87. result = subprocess.run(
  88. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  89. cwd=tmp_path,
  90. capture_output=True,
  91. text=True,
  92. )
  93. assert result.returncode == 0
  94. # Output goes to stdout (JSONL)
  95. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  96. urls = {json.loads(line)['url'] for line in lines}
  97. assert 'https://atom.example.com/entry/1' in urls
  98. assert 'https://atom.example.com/entry/2' in urls
  99. def test_skips_when_no_entries(self, tmp_path):
  100. """Test that script returns skipped status when feed has no entries."""
  101. input_file = tmp_path / 'empty.rss'
  102. input_file.write_text('''<?xml version="1.0"?>
  103. <rss version="2.0">
  104. <channel>
  105. <title>Empty Feed</title>
  106. </channel>
  107. </rss>
  108. ''')
  109. result = subprocess.run(
  110. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  111. cwd=tmp_path,
  112. capture_output=True,
  113. text=True,
  114. )
  115. assert result.returncode == 0
  116. assert 'urls.jsonl' in result.stderr
  117. assert '"status": "skipped"' in result.stdout
  118. def test_exits_1_when_file_not_found(self, tmp_path):
  119. """Test that script exits with code 1 when file doesn't exist."""
  120. result = subprocess.run(
  121. [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/feed.rss'],
  122. cwd=tmp_path,
  123. capture_output=True,
  124. text=True,
  125. )
  126. assert result.returncode == 1
  127. assert 'Failed to fetch' in result.stderr
  128. def test_handles_html_entities_in_urls(self, tmp_path):
  129. """Test that HTML entities in URLs are decoded."""
  130. input_file = tmp_path / 'feed.rss'
  131. input_file.write_text('''<?xml version="1.0"?>
  132. <rss version="2.0">
  133. <channel>
  134. <item>
  135. <title>Entity Test</title>
  136. <link>https://example.com/page?a=1&amp;b=2</link>
  137. </item>
  138. </channel>
  139. </rss>
  140. ''')
  141. result = subprocess.run(
  142. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  143. cwd=tmp_path,
  144. capture_output=True,
  145. text=True,
  146. )
  147. assert result.returncode == 0
  148. # Output goes to stdout (JSONL)
  149. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  150. entry = json.loads(lines[0])
  151. assert entry['url'] == 'https://example.com/page?a=1&b=2'
  152. def test_includes_optional_metadata(self, tmp_path):
  153. """Test that title and timestamp are included when present."""
  154. input_file = tmp_path / 'feed.rss'
  155. input_file.write_text('''<?xml version="1.0"?>
  156. <rss version="2.0">
  157. <channel>
  158. <item>
  159. <title>Test Title</title>
  160. <link>https://example.com/test</link>
  161. <pubDate>Wed, 15 Jan 2020 10:30:00 GMT</pubDate>
  162. </item>
  163. </channel>
  164. </rss>
  165. ''')
  166. result = subprocess.run(
  167. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  168. cwd=tmp_path,
  169. capture_output=True,
  170. text=True,
  171. )
  172. assert result.returncode == 0
  173. # Output goes to stdout (JSONL)
  174. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  175. entry = json.loads(lines[0])
  176. assert entry['url'] == 'https://example.com/test'
  177. assert entry['title'] == 'Test Title'
  178. # Parser converts timestamp to bookmarked_at
  179. assert 'bookmarked_at' in entry
  180. if __name__ == '__main__':
  181. pytest.main([__file__, '-v'])