test_modalcloser.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. """
  2. Integration tests for modalcloser plugin
  3. Tests verify:
  4. 1. Hook script exists
  5. 2. Dependencies installed via chrome validation hooks
  6. 3. Verify deps with abx-pkg
  7. 4. MODALCLOSER_ENABLED=False skips without JSONL
  8. 5. Fails gracefully when no chrome session exists
  9. 6. Background script runs and handles SIGTERM correctly
  10. 7. Config options work (timeout, poll interval)
  11. 8. Live test: hides cookie consent on filmin.es
  12. """
  13. import json
  14. import os
  15. import signal
  16. import subprocess
  17. import time
  18. import tempfile
  19. from pathlib import Path
  20. import pytest
  21. # Import shared Chrome test helpers
  22. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  23. get_test_env,
  24. chrome_session,
  25. )
  26. PLUGIN_DIR = Path(__file__).parent.parent
  27. MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
  28. TEST_URL = 'https://www.singsing.movie/'
  29. COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
  30. def test_hook_script_exists():
  31. """Verify on_Snapshot hook exists."""
  32. assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found"
  33. assert MODALCLOSER_HOOK.exists(), f"Hook not found: {MODALCLOSER_HOOK}"
  34. def test_verify_deps_with_abx_pkg():
  35. """Verify dependencies are available via abx-pkg after hook installation."""
  36. from abx_pkg import Binary, EnvProvider
  37. EnvProvider.model_rebuild()
  38. # Verify node is available
  39. node_binary = Binary(name='node', binproviders=[EnvProvider()])
  40. node_loaded = node_binary.load()
  41. assert node_loaded and node_loaded.abspath, "Node.js required for modalcloser plugin"
  42. def test_config_modalcloser_disabled_skips():
  43. """Test that MODALCLOSER_ENABLED=False exits without emitting JSONL."""
  44. with tempfile.TemporaryDirectory() as tmpdir:
  45. tmpdir = Path(tmpdir)
  46. env = get_test_env()
  47. env['MODALCLOSER_ENABLED'] = 'False'
  48. result = subprocess.run(
  49. ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
  50. cwd=tmpdir,
  51. capture_output=True,
  52. text=True,
  53. env=env,
  54. timeout=30
  55. )
  56. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  57. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  58. # Should NOT emit any JSONL
  59. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  60. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
  61. def test_fails_gracefully_without_chrome_session():
  62. """Test that hook fails gracefully when no chrome session exists."""
  63. with tempfile.TemporaryDirectory() as tmpdir:
  64. tmpdir = Path(tmpdir)
  65. modalcloser_dir = tmpdir / 'snapshot' / 'modalcloser'
  66. modalcloser_dir.mkdir(parents=True, exist_ok=True)
  67. result = subprocess.run(
  68. ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
  69. cwd=modalcloser_dir,
  70. capture_output=True,
  71. text=True,
  72. env=get_test_env(),
  73. timeout=30
  74. )
  75. # Should fail (exit 1) when no chrome session
  76. assert result.returncode != 0, "Should fail when no chrome session exists"
  77. # Error could be about chrome/CDP not found, or puppeteer module missing
  78. err_lower = result.stderr.lower()
  79. assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \
  80. f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
  81. def test_background_script_handles_sigterm():
  82. """Test that background script runs and handles SIGTERM correctly."""
  83. with tempfile.TemporaryDirectory() as tmpdir:
  84. modalcloser_process = None
  85. try:
  86. with chrome_session(
  87. Path(tmpdir),
  88. crawl_id='test-modalcloser',
  89. snapshot_id='snap-modalcloser',
  90. test_url=TEST_URL,
  91. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  92. # Create modalcloser output directory (sibling to chrome)
  93. modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
  94. modalcloser_dir.mkdir()
  95. # Run modalcloser as background process (use env from setup_chrome_session)
  96. env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
  97. modalcloser_process = subprocess.Popen(
  98. ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
  99. cwd=str(modalcloser_dir),
  100. stdout=subprocess.PIPE,
  101. stderr=subprocess.PIPE,
  102. text=True,
  103. env=env
  104. )
  105. # Let it run for a bit
  106. time.sleep(2)
  107. # Verify it's still running (background script)
  108. assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
  109. # Send SIGTERM
  110. modalcloser_process.send_signal(signal.SIGTERM)
  111. stdout, stderr = modalcloser_process.communicate(timeout=5)
  112. assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
  113. # Parse JSONL output
  114. result_json = None
  115. for line in stdout.strip().split('\n'):
  116. line = line.strip()
  117. if line.startswith('{'):
  118. try:
  119. record = json.loads(line)
  120. if record.get('type') == 'ArchiveResult':
  121. result_json = record
  122. break
  123. except json.JSONDecodeError:
  124. pass
  125. assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
  126. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  127. # Verify output_str format
  128. output_str = result_json.get('output_str', '')
  129. assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
  130. f"output_str should mention modals/dialogs: {output_str}"
  131. # Verify no files created in output directory
  132. output_files = list(modalcloser_dir.iterdir())
  133. assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
  134. finally:
  135. if modalcloser_process and modalcloser_process.poll() is None:
  136. modalcloser_process.kill()
  137. def test_dialog_handler_logs_dialogs():
  138. """Test that dialog handler is set up correctly."""
  139. with tempfile.TemporaryDirectory() as tmpdir:
  140. modalcloser_process = None
  141. try:
  142. with chrome_session(
  143. Path(tmpdir),
  144. crawl_id='test-dialog',
  145. snapshot_id='snap-dialog',
  146. test_url=TEST_URL,
  147. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  148. modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
  149. modalcloser_dir.mkdir()
  150. # Use env from setup_chrome_session
  151. env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
  152. env['MODALCLOSER_POLL_INTERVAL'] = '200'
  153. modalcloser_process = subprocess.Popen(
  154. ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
  155. cwd=str(modalcloser_dir),
  156. stdout=subprocess.PIPE,
  157. stderr=subprocess.PIPE,
  158. text=True,
  159. env=env
  160. )
  161. # Let it run briefly
  162. time.sleep(1.5)
  163. # Verify it's running
  164. assert modalcloser_process.poll() is None, "Should be running"
  165. # Check stderr for "listening" message
  166. # Note: Can't read stderr while process is running without blocking,
  167. # so we just verify it exits cleanly
  168. modalcloser_process.send_signal(signal.SIGTERM)
  169. stdout, stderr = modalcloser_process.communicate(timeout=5)
  170. assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
  171. f"Should log startup message: {stderr}"
  172. assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
  173. finally:
  174. if modalcloser_process and modalcloser_process.poll() is None:
  175. modalcloser_process.kill()
  176. def test_config_poll_interval():
  177. """Test that MODALCLOSER_POLL_INTERVAL config is respected."""
  178. with tempfile.TemporaryDirectory() as tmpdir:
  179. chrome_launch_process = None
  180. chrome_pid = None
  181. modalcloser_process = None
  182. try:
  183. with chrome_session(
  184. Path(tmpdir),
  185. crawl_id='test-poll',
  186. snapshot_id='snap-poll',
  187. test_url=TEST_URL,
  188. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  189. modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
  190. modalcloser_dir.mkdir()
  191. # Set very short poll interval (use env from setup_chrome_session)
  192. env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
  193. modalcloser_process = subprocess.Popen(
  194. ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
  195. cwd=str(modalcloser_dir),
  196. stdout=subprocess.PIPE,
  197. stderr=subprocess.PIPE,
  198. text=True,
  199. env=env
  200. )
  201. # Run for short time
  202. time.sleep(1)
  203. # Should still be running
  204. assert modalcloser_process.poll() is None, "Should still be running"
  205. # Clean exit
  206. modalcloser_process.send_signal(signal.SIGTERM)
  207. stdout, stderr = modalcloser_process.communicate(timeout=5)
  208. assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
  209. # Verify JSONL output exists
  210. result_json = None
  211. for line in stdout.strip().split('\n'):
  212. if line.strip().startswith('{'):
  213. try:
  214. record = json.loads(line)
  215. if record.get('type') == 'ArchiveResult':
  216. result_json = record
  217. break
  218. except json.JSONDecodeError:
  219. pass
  220. assert result_json is not None, "Should have JSONL output"
  221. assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
  222. finally:
  223. if modalcloser_process and modalcloser_process.poll() is None:
  224. modalcloser_process.kill()
  225. def test_hides_cookie_consent_on_filmin():
  226. """Live test: verify modalcloser hides cookie consent popup on filmin.es."""
  227. # Create a test script that uses puppeteer directly
  228. test_script = '''
  229. const puppeteer = require('puppeteer-core');
  230. async function closeModals(page) {
  231. return page.evaluate(() => {
  232. let closed = 0;
  233. // Bootstrap 4/5
  234. if (typeof bootstrap !== 'undefined' && bootstrap.Modal) {
  235. document.querySelectorAll('.modal.show').forEach(el => {
  236. try {
  237. const modal = bootstrap.Modal.getInstance(el);
  238. if (modal) { modal.hide(); closed++; }
  239. } catch (e) {}
  240. });
  241. }
  242. // Bootstrap 3 / jQuery
  243. if (typeof jQuery !== 'undefined' && jQuery.fn && jQuery.fn.modal) {
  244. try {
  245. const $modals = jQuery('.modal.in, .modal.show');
  246. if ($modals.length > 0) {
  247. $modals.modal('hide');
  248. closed += $modals.length;
  249. }
  250. } catch (e) {}
  251. }
  252. // Generic selectors including cookie consent
  253. const genericSelectors = [
  254. // CookieYes (cky) specific selectors
  255. '.cky-consent-container',
  256. '.cky-popup-center',
  257. '.cky-overlay',
  258. '.cky-modal',
  259. '#ckyPreferenceCenter',
  260. // Generic cookie consent
  261. '#cookie-consent', '.cookie-banner', '.cookie-notice',
  262. '#cookieConsent', '.cookie-consent', '.cookies-banner',
  263. '[class*="cookie"][class*="banner"]',
  264. '[class*="cookie"][class*="notice"]',
  265. '[class*="consent"]',
  266. '[class*="gdpr"]',
  267. '.modal-overlay', '.modal-backdrop',
  268. '.popup-overlay', '.newsletter-popup',
  269. ];
  270. genericSelectors.forEach(selector => {
  271. try {
  272. document.querySelectorAll(selector).forEach(el => {
  273. const style = window.getComputedStyle(el);
  274. if (style.display === 'none' || style.visibility === 'hidden') return;
  275. el.style.display = 'none';
  276. el.style.visibility = 'hidden';
  277. el.style.opacity = '0';
  278. el.style.pointerEvents = 'none';
  279. closed++;
  280. });
  281. } catch (e) {}
  282. });
  283. document.body.style.overflow = '';
  284. document.body.classList.remove('modal-open', 'overflow-hidden', 'no-scroll');
  285. return closed;
  286. });
  287. }
  288. async function main() {
  289. const browser = await puppeteer.launch({
  290. headless: 'new',
  291. executablePath: process.env.CHROME_BINARY || '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  292. args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled']
  293. });
  294. const page = await browser.newPage();
  295. // Set real user agent to bypass headless detection
  296. await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
  297. await page.setViewport({ width: 1440, height: 900 });
  298. console.error('Navigating to filmin.es...');
  299. await page.goto('https://www.filmin.es/', { waitUntil: 'networkidle2', timeout: 30000 });
  300. // Wait for cookie consent to appear
  301. await new Promise(r => setTimeout(r, 3000));
  302. // Check BEFORE
  303. const before = await page.evaluate(() => {
  304. const el = document.querySelector('.cky-consent-container');
  305. if (!el) return { found: false };
  306. const style = window.getComputedStyle(el);
  307. return { found: true, display: style.display, visibility: style.visibility };
  308. });
  309. console.error('Before:', JSON.stringify(before));
  310. // Run modal closer
  311. const closed = await closeModals(page);
  312. console.error('Closed:', closed, 'modals');
  313. // Check AFTER
  314. const after = await page.evaluate(() => {
  315. const el = document.querySelector('.cky-consent-container');
  316. if (!el) return { found: false };
  317. const style = window.getComputedStyle(el);
  318. return { found: true, display: style.display, visibility: style.visibility };
  319. });
  320. console.error('After:', JSON.stringify(after));
  321. await browser.close();
  322. // Output result as JSON for Python to parse
  323. const result = {
  324. before_found: before.found,
  325. before_visible: before.found && before.display !== 'none' && before.visibility !== 'hidden',
  326. after_hidden: !after.found || after.display === 'none' || after.visibility === 'hidden',
  327. modals_closed: closed
  328. };
  329. console.log(JSON.stringify(result));
  330. }
  331. main().catch(e => {
  332. console.error('Error:', e.message);
  333. process.exit(1);
  334. });
  335. '''
  336. with tempfile.TemporaryDirectory() as tmpdir:
  337. tmpdir = Path(tmpdir)
  338. script_path = tmpdir / 'test_cookie_consent.js'
  339. script_path.write_text(test_script)
  340. env = get_test_env()
  341. result = subprocess.run(
  342. ['node', str(script_path)],
  343. cwd=tmpdir,
  344. capture_output=True,
  345. text=True,
  346. env=env,
  347. timeout=60
  348. )
  349. print(f"stderr: {result.stderr}")
  350. print(f"stdout: {result.stdout}")
  351. assert result.returncode == 0, f"Test script failed: {result.stderr}"
  352. # Parse the JSON output
  353. output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
  354. assert len(output_lines) > 0, f"No JSON output from test script. stdout: {result.stdout}"
  355. test_result = json.loads(output_lines[-1])
  356. # The cookie consent should have been found initially (or page changed)
  357. # After running closeModals, it should be hidden
  358. if test_result['before_found']:
  359. assert test_result['after_hidden'], \
  360. f"Cookie consent should be hidden after modalcloser. Result: {test_result}"
  361. assert test_result['modals_closed'] > 0, \
  362. f"Should have closed at least one modal. Result: {test_result}"
  363. else:
  364. # Page may have changed, just verify no errors
  365. print("Cookie consent element not found (page may have changed)")
  366. if __name__ == '__main__':
  367. pytest.main([__file__, '-v'])