test_parse_jsonl_urls.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. #!/usr/bin/env python3
  2. """Unit tests for parse_jsonl_urls extractor."""
  3. import json
  4. import subprocess
  5. import sys
  6. from pathlib import Path
  7. import pytest
  8. PLUGIN_DIR = Path(__file__).parent.parent
  9. SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.*'), None)
  10. class TestParseJsonlUrls:
  11. """Test the parse_jsonl_urls extractor CLI."""
  12. def test_extracts_urls_from_jsonl(self, tmp_path):
  13. """Test extracting URLs from JSONL bookmark file."""
  14. input_file = tmp_path / 'bookmarks.jsonl'
  15. input_file.write_text(
  16. '{"url": "https://example.com", "title": "Example"}\n'
  17. '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
  18. '{"url": "https://test.org", "title": "Test Org"}\n'
  19. )
  20. result = subprocess.run(
  21. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  22. cwd=tmp_path,
  23. capture_output=True,
  24. text=True,
  25. )
  26. assert result.returncode == 0
  27. assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
  28. # Output goes to stdout (JSONL)
  29. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  30. assert len(lines) == 3
  31. entries = [json.loads(line) for line in lines]
  32. urls = {e['url'] for e in entries}
  33. titles = {e.get('title') for e in entries}
  34. assert 'https://example.com' in urls
  35. assert 'https://foo.bar/page' in urls
  36. assert 'https://test.org' in urls
  37. assert 'Example' in titles
  38. assert 'Foo Bar' in titles
  39. assert 'Test Org' in titles
  40. def test_supports_href_field(self, tmp_path):
  41. """Test that 'href' field is recognized as URL."""
  42. input_file = tmp_path / 'bookmarks.jsonl'
  43. input_file.write_text('{"href": "https://example.com", "title": "Test"}\n')
  44. result = subprocess.run(
  45. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  46. cwd=tmp_path,
  47. capture_output=True,
  48. text=True,
  49. )
  50. assert result.returncode == 0
  51. # Output goes to stdout (JSONL)
  52. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  53. entry = json.loads(lines[0])
  54. assert entry['url'] == 'https://example.com'
  55. def test_supports_description_as_title(self, tmp_path):
  56. """Test that 'description' field is used as title fallback."""
  57. input_file = tmp_path / 'bookmarks.jsonl'
  58. input_file.write_text('{"url": "https://example.com", "description": "A description"}\n')
  59. result = subprocess.run(
  60. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  61. cwd=tmp_path,
  62. capture_output=True,
  63. text=True,
  64. )
  65. assert result.returncode == 0
  66. # Output goes to stdout (JSONL)
  67. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  68. entry = json.loads(lines[0])
  69. assert entry['title'] == 'A description'
  70. def test_parses_various_timestamp_formats(self, tmp_path):
  71. """Test parsing of different timestamp field names."""
  72. input_file = tmp_path / 'bookmarks.jsonl'
  73. input_file.write_text('{"url": "https://example.com", "timestamp": 1609459200000000}\n')
  74. result = subprocess.run(
  75. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  76. cwd=tmp_path,
  77. capture_output=True,
  78. text=True,
  79. )
  80. assert result.returncode == 0
  81. # Output goes to stdout (JSONL)
  82. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  83. entry = json.loads(lines[0])
  84. # Parser converts timestamp to bookmarked_at
  85. assert 'bookmarked_at' in entry
  86. def test_parses_tags_as_string(self, tmp_path):
  87. """Test parsing tags as comma-separated string."""
  88. input_file = tmp_path / 'bookmarks.jsonl'
  89. input_file.write_text('{"url": "https://example.com", "tags": "tech,news,reading"}\n')
  90. result = subprocess.run(
  91. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  92. cwd=tmp_path,
  93. capture_output=True,
  94. text=True,
  95. )
  96. assert result.returncode == 0
  97. # Output goes to stdout (JSONL)
  98. # Parser converts tags to separate Tag objects in the output
  99. content = result.stdout
  100. assert 'tech' in content or 'news' in content or 'Tag' in content
  101. def test_parses_tags_as_list(self, tmp_path):
  102. """Test parsing tags as JSON array."""
  103. input_file = tmp_path / 'bookmarks.jsonl'
  104. input_file.write_text('{"url": "https://example.com", "tags": ["tech", "news"]}\n')
  105. result = subprocess.run(
  106. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  107. cwd=tmp_path,
  108. capture_output=True,
  109. text=True,
  110. )
  111. assert result.returncode == 0
  112. # Output goes to stdout (JSONL)
  113. # Parser converts tags to separate Tag objects in the output
  114. content = result.stdout
  115. assert 'tech' in content or 'news' in content or 'Tag' in content
  116. def test_skips_malformed_lines(self, tmp_path):
  117. """Test that malformed JSON lines are skipped."""
  118. input_file = tmp_path / 'bookmarks.jsonl'
  119. input_file.write_text(
  120. '{"url": "https://valid.com"}\n'
  121. 'not valid json\n'
  122. '{"url": "https://also-valid.com"}\n'
  123. )
  124. result = subprocess.run(
  125. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  126. cwd=tmp_path,
  127. capture_output=True,
  128. text=True,
  129. )
  130. assert result.returncode == 0
  131. # Output goes to stdout (JSONL)
  132. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  133. assert len(lines) == 2
  134. def test_skips_entries_without_url(self, tmp_path):
  135. """Test that entries without URL field are skipped."""
  136. input_file = tmp_path / 'bookmarks.jsonl'
  137. input_file.write_text(
  138. '{"url": "https://valid.com"}\n'
  139. '{"title": "No URL here"}\n'
  140. '{"url": "https://also-valid.com"}\n'
  141. )
  142. result = subprocess.run(
  143. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  144. cwd=tmp_path,
  145. capture_output=True,
  146. text=True,
  147. )
  148. assert result.returncode == 0
  149. # Output goes to stdout (JSONL)
  150. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  151. assert len(lines) == 2
  152. def test_skips_when_no_urls_found(self, tmp_path):
  153. """Test that script returns skipped status when no URLs found."""
  154. input_file = tmp_path / 'empty.jsonl'
  155. input_file.write_text('{"title": "No URL"}\n')
  156. result = subprocess.run(
  157. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  158. cwd=tmp_path,
  159. capture_output=True,
  160. text=True,
  161. )
  162. assert result.returncode == 0
  163. assert 'urls.jsonl' in result.stderr
  164. assert '"status": "skipped"' in result.stdout
  165. def test_exits_1_when_file_not_found(self, tmp_path):
  166. """Test that script exits with code 1 when file doesn't exist."""
  167. result = subprocess.run(
  168. [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.jsonl'],
  169. cwd=tmp_path,
  170. capture_output=True,
  171. text=True,
  172. )
  173. assert result.returncode == 1
  174. assert 'Failed to fetch' in result.stderr
  175. def test_handles_html_entities(self, tmp_path):
  176. """Test that HTML entities in URLs and titles are decoded."""
  177. input_file = tmp_path / 'bookmarks.jsonl'
  178. input_file.write_text('{"url": "https://example.com/page?a=1&b=2", "title": "Test & Title"}\n')
  179. result = subprocess.run(
  180. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  181. cwd=tmp_path,
  182. capture_output=True,
  183. text=True,
  184. )
  185. assert result.returncode == 0
  186. # Output goes to stdout (JSONL)
  187. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  188. entry = json.loads(lines[0])
  189. assert entry['url'] == 'https://example.com/page?a=1&b=2'
  190. assert entry['title'] == 'Test & Title'
  191. def test_skips_empty_lines(self, tmp_path):
  192. """Test that empty lines are skipped."""
  193. input_file = tmp_path / 'bookmarks.jsonl'
  194. input_file.write_text(
  195. '{"url": "https://example.com"}\n'
  196. '\n'
  197. ' \n'
  198. '{"url": "https://other.com"}\n'
  199. )
  200. result = subprocess.run(
  201. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  202. cwd=tmp_path,
  203. capture_output=True,
  204. text=True,
  205. )
  206. assert result.returncode == 0
  207. # Output goes to stdout (JSONL)
  208. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  209. assert len(lines) == 2
  210. def test_output_includes_required_fields(self, tmp_path):
  211. """Test that output includes required fields."""
  212. input_file = tmp_path / 'bookmarks.jsonl'
  213. input_file.write_text('{"url": "https://example.com"}\n')
  214. result = subprocess.run(
  215. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  216. cwd=tmp_path,
  217. capture_output=True,
  218. text=True,
  219. )
  220. assert result.returncode == 0
  221. # Output goes to stdout (JSONL)
  222. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  223. entry = json.loads(lines[0])
  224. assert entry['url'] == 'https://example.com'
  225. assert 'type' in entry
  226. assert 'plugin' in entry
  227. if __name__ == '__main__':
  228. pytest.main([__file__, '-v'])