test_forumdl.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. """
Integration tests for the forumdl plugin.

Tests verify:
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
4. Forum extraction works on forum URLs
5. JSONL output is correct
6. Config options work
7. Handles non-forum URLs gracefully
  12. """
  13. import json
  14. import os
  15. import subprocess
  16. import sys
  17. import tempfile
  18. import time
  19. import uuid
  20. from pathlib import Path
  21. import pytest
  22. PLUGIN_DIR = Path(__file__).parent.parent
  23. PLUGINS_ROOT = PLUGIN_DIR.parent
  24. FORUMDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_forumdl.*'), None)
  25. TEST_URL = 'https://example.com'
  26. # Module-level cache for binary path
  27. _forumdl_binary_path = None
  28. _forumdl_lib_root = None
  29. def get_forumdl_binary_path():
  30. """Get the installed forum-dl binary path from cache or by running installation."""
  31. global _forumdl_binary_path
  32. if _forumdl_binary_path:
  33. return _forumdl_binary_path
  34. # Try to find forum-dl binary using abx-pkg
  35. from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
  36. try:
  37. binary = Binary(
  38. name='forum-dl',
  39. binproviders=[PipProvider(), EnvProvider()]
  40. ).load()
  41. if binary and binary.abspath:
  42. _forumdl_binary_path = str(binary.abspath)
  43. return _forumdl_binary_path
  44. except Exception:
  45. pass
  46. # If not found, try to install via pip using the crawl hook overrides
  47. pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py'
  48. crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py'
  49. if pip_hook.exists():
  50. binary_id = str(uuid.uuid4())
  51. machine_id = str(uuid.uuid4())
  52. overrides = None
  53. if crawl_hook.exists():
  54. crawl_result = subprocess.run(
  55. [sys.executable, str(crawl_hook)],
  56. capture_output=True,
  57. text=True,
  58. timeout=30,
  59. )
  60. for crawl_line in crawl_result.stdout.strip().split('\n'):
  61. if crawl_line.strip().startswith('{'):
  62. try:
  63. crawl_record = json.loads(crawl_line)
  64. if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl':
  65. overrides = crawl_record.get('overrides')
  66. break
  67. except json.JSONDecodeError:
  68. continue
  69. # Create a persistent temp LIB_DIR for the pip provider
  70. import platform
  71. global _forumdl_lib_root
  72. if not _forumdl_lib_root:
  73. _forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-')
  74. machine = platform.machine().lower()
  75. system = platform.system().lower()
  76. if machine in ('arm64', 'aarch64'):
  77. machine = 'arm64'
  78. elif machine in ('x86_64', 'amd64'):
  79. machine = 'x86_64'
  80. machine_type = f"{machine}-{system}"
  81. lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type
  82. lib_dir.mkdir(parents=True, exist_ok=True)
  83. env = os.environ.copy()
  84. env['LIB_DIR'] = str(lib_dir)
  85. env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data')
  86. cmd = [
  87. sys.executable, str(pip_hook),
  88. '--binary-id', binary_id,
  89. '--machine-id', machine_id,
  90. '--name', 'forum-dl'
  91. ]
  92. if overrides:
  93. cmd.append(f'--overrides={json.dumps(overrides)}')
  94. install_result = subprocess.run(
  95. cmd,
  96. capture_output=True,
  97. text=True,
  98. timeout=300,
  99. env=env,
  100. )
  101. # Parse Binary from pip installation
  102. for install_line in install_result.stdout.strip().split('\n'):
  103. if install_line.strip():
  104. try:
  105. install_record = json.loads(install_line)
  106. if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
  107. _forumdl_binary_path = install_record.get('abspath')
  108. return _forumdl_binary_path
  109. except json.JSONDecodeError:
  110. pass
  111. return None
  112. def test_hook_script_exists():
  113. """Verify on_Snapshot hook exists."""
  114. assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
  115. def test_verify_deps_with_abx_pkg():
  116. """Verify forum-dl is installed by calling the REAL installation hooks."""
  117. binary_path = get_forumdl_binary_path()
  118. if not binary_path:
  119. assert False, (
  120. "forum-dl installation failed. Install hook should install forum-dl automatically. "
  121. "Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
  122. "due to removed longintrepr.h header."
  123. )
  124. assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
  125. def test_handles_non_forum_url():
  126. """Test that forum-dl extractor handles non-forum URLs gracefully via hook."""
  127. import os
  128. binary_path = get_forumdl_binary_path()
  129. if not binary_path:
  130. pass
  131. assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
  132. with tempfile.TemporaryDirectory() as tmpdir:
  133. tmpdir = Path(tmpdir)
  134. env = os.environ.copy()
  135. env['FORUMDL_BINARY'] = binary_path
  136. # Run forum-dl extraction hook on non-forum URL
  137. result = subprocess.run(
  138. [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
  139. cwd=tmpdir,
  140. capture_output=True,
  141. text=True,
  142. env=env,
  143. timeout=60
  144. )
  145. # Should exit 0 even for non-forum URL (graceful handling)
  146. assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"
  147. # Parse clean JSONL output
  148. result_json = None
  149. for line in result.stdout.strip().split('\n'):
  150. line = line.strip()
  151. if line.startswith('{'):
  152. pass
  153. try:
  154. record = json.loads(line)
  155. if record.get('type') == 'ArchiveResult':
  156. result_json = record
  157. break
  158. except json.JSONDecodeError:
  159. pass
  160. assert result_json, "Should have ArchiveResult JSONL output"
  161. assert result_json['status'] == 'succeeded', f"Should succeed even for non-forum URL: {result_json}"
  162. def test_config_save_forumdl_false_skips():
  163. """Test that FORUMDL_ENABLED=False exits without emitting JSONL."""
  164. import os
  165. with tempfile.TemporaryDirectory() as tmpdir:
  166. env = os.environ.copy()
  167. env['FORUMDL_ENABLED'] = 'False'
  168. result = subprocess.run(
  169. [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
  170. cwd=tmpdir,
  171. capture_output=True,
  172. text=True,
  173. env=env,
  174. timeout=30
  175. )
  176. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  177. # Feature disabled - temporary failure, should NOT emit JSONL
  178. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  179. # Should NOT emit any JSONL
  180. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  181. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
  182. def test_config_timeout():
  183. """Test that FORUMDL_TIMEOUT config is respected."""
  184. import os
  185. binary_path = get_forumdl_binary_path()
  186. if not binary_path:
  187. pass
  188. assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
  189. with tempfile.TemporaryDirectory() as tmpdir:
  190. env = os.environ.copy()
  191. env['FORUMDL_BINARY'] = binary_path
  192. env['FORUMDL_TIMEOUT'] = '5'
  193. start_time = time.time()
  194. result = subprocess.run(
  195. [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
  196. cwd=tmpdir,
  197. capture_output=True,
  198. text=True,
  199. env=env,
  200. timeout=10 # Should complete in 5s, use 10s as safety margin
  201. )
  202. elapsed_time = time.time() - start_time
  203. assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
  204. # Allow 1 second overhead for subprocess startup and Python interpreter
  205. assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
  206. def test_real_forum_url():
  207. """Test that forum-dl extracts content from a real HackerNews thread with jsonl output.
  208. Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility.
  209. """
  210. import os
  211. binary_path = get_forumdl_binary_path()
  212. assert binary_path, "forum-dl binary not available"
  213. assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
  214. with tempfile.TemporaryDirectory() as tmpdir:
  215. tmpdir = Path(tmpdir)
  216. # Use HackerNews - one of the most reliable forum-dl extractors
  217. forum_url = 'https://news.ycombinator.com/item?id=1'
  218. env = os.environ.copy()
  219. env['FORUMDL_BINARY'] = binary_path
  220. env['FORUMDL_TIMEOUT'] = '60'
  221. env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format
  222. # HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files'])
  223. start_time = time.time()
  224. result = subprocess.run(
  225. [sys.executable, str(FORUMDL_HOOK), '--url', forum_url, '--snapshot-id', 'testforum'],
  226. cwd=tmpdir,
  227. capture_output=True,
  228. text=True,
  229. env=env,
  230. timeout=90
  231. )
  232. elapsed_time = time.time() - start_time
  233. # Should succeed with our Pydantic v2 wrapper
  234. assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}"
  235. # Parse JSONL output
  236. result_json = None
  237. for line in result.stdout.strip().split('\n'):
  238. line = line.strip()
  239. if line.startswith('{'):
  240. try:
  241. record = json.loads(line)
  242. if record.get('type') == 'ArchiveResult':
  243. result_json = record
  244. break
  245. except json.JSONDecodeError:
  246. pass
  247. assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
  248. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  249. # Check that forum files were downloaded
  250. output_files = list(tmpdir.glob('**/*'))
  251. forum_files = [f for f in output_files if f.is_file()]
  252. assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}"
  253. # Verify the JSONL file has content
  254. jsonl_file = tmpdir / 'forum.jsonl'
  255. assert jsonl_file.exists(), "Should have created forum.jsonl"
  256. assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty"
  257. print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
  258. if __name__ == '__main__':
  259. pytest.main([__file__, '-v'])