test_favicon.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. """
  2. Integration tests for favicon plugin
  3. Tests verify:
  4. 1. Plugin script exists
  5. 2. requests library is available
  6. 3. Favicon extraction works for real example.com
  7. 4. Output file is actual image data
  8. 5. Tries multiple favicon URLs
  9. 6. Falls back to Google's favicon service
  10. 7. Config options work (TIMEOUT, USER_AGENT)
  11. 8. Handles failures gracefully
  12. """
  13. import json
  14. import subprocess
  15. import sys
  16. import tempfile
  17. from pathlib import Path
  18. import pytest
  19. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  20. get_plugin_dir,
  21. get_hook_script,
  22. parse_jsonl_output,
  23. )
  24. PLUGIN_DIR = get_plugin_dir(__file__)
  25. FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
  26. TEST_URL = 'https://example.com'
  27. def test_hook_script_exists():
  28. """Verify hook script exists."""
  29. assert FAVICON_HOOK.exists(), f"Hook script not found: {FAVICON_HOOK}"
  30. def test_requests_library_available():
  31. """Test that requests library is available."""
  32. result = subprocess.run(
  33. [sys.executable, '-c', 'import requests; print(requests.__version__)'],
  34. capture_output=True,
  35. text=True
  36. )
  37. if result.returncode != 0:
  38. pass
  39. assert len(result.stdout.strip()) > 0, "Should report requests version"
  40. def test_extracts_favicon_from_example_com():
  41. """Test full workflow: extract favicon from real example.com.
  42. Note: example.com doesn't have a favicon and Google's service may also fail,
  43. so we test that the extraction completes and reports appropriate status.
  44. """
  45. # Check requests is available
  46. check_result = subprocess.run(
  47. [sys.executable, '-c', 'import requests'],
  48. capture_output=True
  49. )
  50. if check_result.returncode != 0:
  51. pass
  52. with tempfile.TemporaryDirectory() as tmpdir:
  53. tmpdir = Path(tmpdir)
  54. # Run favicon extraction
  55. result = subprocess.run(
  56. [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
  57. cwd=tmpdir,
  58. capture_output=True,
  59. text=True,
  60. timeout=60
  61. )
  62. # May succeed (if Google service works) or fail (if no favicon)
  63. assert result.returncode in (0, 1), "Should complete extraction attempt"
  64. # Parse clean JSONL output
  65. result_json = None
  66. for line in result.stdout.strip().split('\n'):
  67. line = line.strip()
  68. if line.startswith('{'):
  69. pass
  70. try:
  71. record = json.loads(line)
  72. if record.get('type') == 'ArchiveResult':
  73. result_json = record
  74. break
  75. except json.JSONDecodeError:
  76. pass
  77. assert result_json, "Should have ArchiveResult JSONL output"
  78. # If it succeeded, verify the favicon file
  79. if result_json['status'] == 'succeeded':
  80. favicon_file = tmpdir / 'favicon.ico'
  81. assert favicon_file.exists(), "favicon.ico not created"
  82. # Verify file is not empty and contains actual image data
  83. file_size = favicon_file.stat().st_size
  84. assert file_size > 0, "Favicon file should not be empty"
  85. assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"
  86. # Check for common image magic bytes
  87. favicon_data = favicon_file.read_bytes()
  88. # ICO, PNG, GIF, JPEG, or WebP
  89. is_image = (
  90. favicon_data[:4] == b'\x00\x00\x01\x00' or # ICO
  91. favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or # PNG
  92. favicon_data[:3] == b'GIF' or # GIF
  93. favicon_data[:2] == b'\xff\xd8' or # JPEG
  94. favicon_data[8:12] == b'WEBP' # WebP
  95. )
  96. assert is_image, "Favicon file should be a valid image format"
  97. else:
  98. # Failed as expected
  99. assert result_json['status'] == 'failed', f"Should report failure: {result_json}"
  100. def test_config_timeout_honored():
  101. """Test that TIMEOUT config is respected."""
  102. check_result = subprocess.run(
  103. [sys.executable, '-c', 'import requests'],
  104. capture_output=True
  105. )
  106. if check_result.returncode != 0:
  107. pass
  108. with tempfile.TemporaryDirectory() as tmpdir:
  109. tmpdir = Path(tmpdir)
  110. # Set very short timeout (but example.com should still succeed)
  111. import os
  112. env = os.environ.copy()
  113. env['TIMEOUT'] = '5'
  114. result = subprocess.run(
  115. [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
  116. cwd=tmpdir,
  117. capture_output=True,
  118. text=True,
  119. env=env,
  120. timeout=30
  121. )
  122. # Should complete (success or fail, but not hang)
  123. assert result.returncode in (0, 1), "Should complete without hanging"
  124. def test_config_user_agent():
  125. """Test that USER_AGENT config is used."""
  126. check_result = subprocess.run(
  127. [sys.executable, '-c', 'import requests'],
  128. capture_output=True
  129. )
  130. if check_result.returncode != 0:
  131. pass
  132. with tempfile.TemporaryDirectory() as tmpdir:
  133. tmpdir = Path(tmpdir)
  134. # Set custom user agent
  135. import os
  136. env = os.environ.copy()
  137. env['USER_AGENT'] = 'TestBot/1.0'
  138. result = subprocess.run(
  139. [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
  140. cwd=tmpdir,
  141. capture_output=True,
  142. text=True,
  143. env=env,
  144. timeout=60
  145. )
  146. # Should succeed (example.com doesn't block)
  147. if result.returncode == 0:
  148. # Parse clean JSONL output
  149. result_json = None
  150. for line in result.stdout.strip().split('\n'):
  151. line = line.strip()
  152. if line.startswith('{'):
  153. pass
  154. try:
  155. record = json.loads(line)
  156. if record.get('type') == 'ArchiveResult':
  157. result_json = record
  158. break
  159. except json.JSONDecodeError:
  160. pass
  161. if result_json:
  162. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  163. def test_handles_https_urls():
  164. """Test that HTTPS URLs work correctly."""
  165. check_result = subprocess.run(
  166. [sys.executable, '-c', 'import requests'],
  167. capture_output=True
  168. )
  169. if check_result.returncode != 0:
  170. pass
  171. with tempfile.TemporaryDirectory() as tmpdir:
  172. tmpdir = Path(tmpdir)
  173. result = subprocess.run(
  174. [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
  175. cwd=tmpdir,
  176. capture_output=True,
  177. text=True,
  178. timeout=60
  179. )
  180. if result.returncode == 0:
  181. favicon_file = tmpdir / 'favicon.ico'
  182. if favicon_file.exists():
  183. assert favicon_file.stat().st_size > 0
  184. def test_handles_missing_favicon_gracefully():
  185. """Test that favicon plugin handles sites without favicons gracefully.
  186. Note: The plugin falls back to Google's favicon service, which generates
  187. a generic icon even if the site doesn't have one, so extraction usually succeeds.
  188. """
  189. check_result = subprocess.run(
  190. [sys.executable, '-c', 'import requests'],
  191. capture_output=True
  192. )
  193. if check_result.returncode != 0:
  194. pass
  195. with tempfile.TemporaryDirectory() as tmpdir:
  196. tmpdir = Path(tmpdir)
  197. # Try a URL that likely doesn't have a favicon
  198. result = subprocess.run(
  199. [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
  200. cwd=tmpdir,
  201. capture_output=True,
  202. text=True,
  203. timeout=60
  204. )
  205. # May succeed (Google fallback) or fail gracefully
  206. assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
  207. if result.returncode != 0:
  208. combined = result.stdout + result.stderr
  209. assert 'No favicon found' in combined or 'ERROR=' in combined
  210. def test_reports_missing_requests_library():
  211. """Test that script reports error when requests library is missing."""
  212. with tempfile.TemporaryDirectory() as tmpdir:
  213. tmpdir = Path(tmpdir)
  214. # Run with PYTHONPATH cleared to simulate missing requests
  215. import os
  216. env = os.environ.copy()
  217. # Keep only minimal PATH, clear PYTHONPATH
  218. env['PYTHONPATH'] = '/nonexistent'
  219. result = subprocess.run(
  220. [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
  221. cwd=tmpdir,
  222. capture_output=True,
  223. text=True,
  224. env=env
  225. )
  226. # Should fail and report missing requests
  227. if result.returncode != 0:
  228. combined = result.stdout + result.stderr
  229. # May report missing requests or other import errors
  230. assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
  231. if __name__ == '__main__':
  232. pytest.main([__file__, '-v'])