test_parse_netscape_urls.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. #!/usr/bin/env python3
  2. """Unit tests for parse_netscape_urls extractor."""
  3. import json
  4. import subprocess
  5. import sys
  6. from pathlib import Path
  7. import pytest
  8. PLUGIN_DIR = Path(__file__).parent.parent
  9. SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.*'), None)
  10. class TestParseNetscapeUrls:
  11. """Test the parse_netscape_urls extractor CLI."""
  12. def test_extracts_urls_from_netscape_bookmarks(self, tmp_path):
  13. """Test extracting URLs from Netscape bookmark HTML format."""
  14. input_file = tmp_path / 'bookmarks.html'
  15. input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
  16. <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
  17. <TITLE>Bookmarks</TITLE>
  18. <H1>Bookmarks</H1>
  19. <DL><p>
  20. <DT><A HREF="https://example.com" ADD_DATE="1609459200">Example Site</A>
  21. <DT><A HREF="https://foo.bar/page" ADD_DATE="1609545600">Foo Bar</A>
  22. <DT><A HREF="https://test.org" ADD_DATE="1609632000">Test Org</A>
  23. </DL><p>
  24. ''')
  25. result = subprocess.run(
  26. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  27. cwd=tmp_path,
  28. capture_output=True,
  29. text=True,
  30. )
  31. assert result.returncode == 0
  32. assert 'urls.jsonl' in result.stderr or 'urls.jsonl' in result.stdout
  33. # Output goes to stdout (JSONL)
  34. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  35. assert len(lines) == 3
  36. entries = [json.loads(line) for line in lines]
  37. urls = {e['url'] for e in entries}
  38. titles = {e.get('title') for e in entries}
  39. assert 'https://example.com' in urls
  40. assert 'https://foo.bar/page' in urls
  41. assert 'https://test.org' in urls
  42. assert 'Example Site' in titles
  43. assert 'Foo Bar' in titles
  44. assert 'Test Org' in titles
  45. def test_parses_add_date_timestamps(self, tmp_path):
  46. """Test that ADD_DATE timestamps are parsed correctly."""
  47. input_file = tmp_path / 'bookmarks.html'
  48. input_file.write_text('''
  49. <DT><A HREF="https://example.com" ADD_DATE="1609459200">Test</A>
  50. ''')
  51. result = subprocess.run(
  52. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  53. cwd=tmp_path,
  54. capture_output=True,
  55. text=True,
  56. )
  57. assert result.returncode == 0
  58. # Output goes to stdout (JSONL)
  59. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  60. entry = json.loads(lines[0])
  61. # Parser converts timestamp to bookmarked_at
  62. assert 'bookmarked_at' in entry
  63. def test_handles_query_params_in_urls(self, tmp_path):
  64. """Test that URLs with query parameters are preserved."""
  65. input_file = tmp_path / 'bookmarks.html'
  66. input_file.write_text('''
  67. <DT><A HREF="https://example.com/search?q=test+query&page=1" ADD_DATE="1609459200">Search</A>
  68. ''')
  69. result = subprocess.run(
  70. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  71. cwd=tmp_path,
  72. capture_output=True,
  73. text=True,
  74. )
  75. assert result.returncode == 0
  76. # Output goes to stdout (JSONL)
  77. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  78. entry = json.loads(lines[0])
  79. assert 'q=test+query' in entry['url']
  80. assert 'page=1' in entry['url']
  81. def test_handles_html_entities(self, tmp_path):
  82. """Test that HTML entities in URLs and titles are decoded."""
  83. input_file = tmp_path / 'bookmarks.html'
  84. input_file.write_text('''
  85. <DT><A HREF="https://example.com/page?a=1&amp;b=2" ADD_DATE="1609459200">Test &amp; Title</A>
  86. ''')
  87. result = subprocess.run(
  88. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  89. cwd=tmp_path,
  90. capture_output=True,
  91. text=True,
  92. )
  93. assert result.returncode == 0
  94. # Output goes to stdout (JSONL)
  95. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  96. entry = json.loads(lines[0])
  97. assert entry['url'] == 'https://example.com/page?a=1&b=2'
  98. assert entry['title'] == 'Test & Title'
  99. def test_skips_when_no_bookmarks_found(self, tmp_path):
  100. """Test that script returns skipped status when no bookmarks found."""
  101. input_file = tmp_path / 'empty.html'
  102. input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
  103. <TITLE>Bookmarks</TITLE>
  104. <H1>Bookmarks</H1>
  105. <DL><p>
  106. </DL><p>
  107. ''')
  108. result = subprocess.run(
  109. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  110. cwd=tmp_path,
  111. capture_output=True,
  112. text=True,
  113. )
  114. assert result.returncode == 0
  115. assert 'urls.jsonl' in result.stderr
  116. assert '"status": "skipped"' in result.stdout
  117. def test_exits_1_when_file_not_found(self, tmp_path):
  118. """Test that script exits with code 1 when file doesn't exist."""
  119. result = subprocess.run(
  120. [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.html'],
  121. cwd=tmp_path,
  122. capture_output=True,
  123. text=True,
  124. )
  125. assert result.returncode == 1
  126. assert 'Failed to fetch' in result.stderr
  127. def test_handles_nested_folders(self, tmp_path):
  128. """Test parsing bookmarks in nested folder structure."""
  129. input_file = tmp_path / 'bookmarks.html'
  130. input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
  131. <DL><p>
  132. <DT><H3>Folder 1</H3>
  133. <DL><p>
  134. <DT><A HREF="https://example.com/nested1" ADD_DATE="1609459200">Nested 1</A>
  135. <DT><H3>Subfolder</H3>
  136. <DL><p>
  137. <DT><A HREF="https://example.com/nested2" ADD_DATE="1609459200">Nested 2</A>
  138. </DL><p>
  139. </DL><p>
  140. <DT><A HREF="https://example.com/top" ADD_DATE="1609459200">Top Level</A>
  141. </DL><p>
  142. ''')
  143. result = subprocess.run(
  144. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  145. cwd=tmp_path,
  146. capture_output=True,
  147. text=True,
  148. )
  149. assert result.returncode == 0
  150. # Output goes to stdout (JSONL)
  151. lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
  152. urls = {json.loads(line)['url'] for line in lines}
  153. assert 'https://example.com/nested1' in urls
  154. assert 'https://example.com/nested2' in urls
  155. assert 'https://example.com/top' in urls
  156. def test_case_insensitive_parsing(self, tmp_path):
  157. """Test that parsing is case-insensitive for HTML tags."""
  158. input_file = tmp_path / 'bookmarks.html'
  159. input_file.write_text('''
  160. <dt><a HREF="https://example.com" ADD_DATE="1609459200">Test</a>
  161. ''')
  162. result = subprocess.run(
  163. [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
  164. cwd=tmp_path,
  165. capture_output=True,
  166. text=True,
  167. )
  168. assert result.returncode == 0
  169. # Output goes to stdout (JSONL)
  170. lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
  171. entry = json.loads(lines[0])
  172. assert entry['url'] == 'https://example.com'
  173. if __name__ == '__main__':
  174. pytest.main([__file__, '-v'])