test_ublock.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725
  1. """
  2. Unit tests for ublock plugin
  3. Tests invoke the plugin hook as an external process and verify outputs/side effects.
  4. """
  5. import json
  6. import os
  7. import subprocess
  8. import tempfile
  9. from pathlib import Path
  10. import pytest
  11. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  12. setup_test_env,
  13. get_test_env,
  14. launch_chromium_session,
  15. kill_chromium_session,
  16. CHROME_LAUNCH_HOOK,
  17. PLUGINS_ROOT,
  18. )
  19. PLUGIN_DIR = Path(__file__).parent.parent
  20. INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
  21. def test_install_script_exists():
  22. """Verify install script exists"""
  23. assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
  24. def test_extension_metadata():
  25. """Test that uBlock Origin extension has correct metadata"""
  26. with tempfile.TemporaryDirectory() as tmpdir:
  27. env = os.environ.copy()
  28. env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
  29. result = subprocess.run(
  30. ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
  31. capture_output=True,
  32. text=True,
  33. env=env
  34. )
  35. assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
  36. metadata = json.loads(result.stdout)
  37. assert metadata["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
  38. assert metadata["name"] == "ublock"
  39. def test_install_creates_cache():
  40. """Test that install creates extension cache"""
  41. with tempfile.TemporaryDirectory() as tmpdir:
  42. ext_dir = Path(tmpdir) / "chrome_extensions"
  43. ext_dir.mkdir(parents=True)
  44. env = os.environ.copy()
  45. env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
  46. result = subprocess.run(
  47. ["node", str(INSTALL_SCRIPT)],
  48. capture_output=True,
  49. text=True,
  50. env=env,
  51. timeout=120 # uBlock is large, may take longer to download
  52. )
  53. # Check output mentions installation
  54. assert "uBlock" in result.stdout or "ublock" in result.stdout
  55. # Check cache file was created
  56. cache_file = ext_dir / "ublock.extension.json"
  57. assert cache_file.exists(), "Cache file should be created"
  58. # Verify cache content
  59. cache_data = json.loads(cache_file.read_text())
  60. assert cache_data["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
  61. assert cache_data["name"] == "ublock"
  62. def test_install_twice_uses_cache():
  63. """Test that running install twice uses existing cache on second run"""
  64. with tempfile.TemporaryDirectory() as tmpdir:
  65. ext_dir = Path(tmpdir) / "chrome_extensions"
  66. ext_dir.mkdir(parents=True)
  67. env = os.environ.copy()
  68. env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
  69. # First install - downloads the extension
  70. result1 = subprocess.run(
  71. ["node", str(INSTALL_SCRIPT)],
  72. capture_output=True,
  73. text=True,
  74. env=env,
  75. timeout=120 # uBlock is large
  76. )
  77. assert result1.returncode == 0, f"First install failed: {result1.stderr}"
  78. # Verify cache was created
  79. cache_file = ext_dir / "ublock.extension.json"
  80. assert cache_file.exists(), "Cache file should exist after first install"
  81. # Second install - should use cache and be faster
  82. result2 = subprocess.run(
  83. ["node", str(INSTALL_SCRIPT)],
  84. capture_output=True,
  85. text=True,
  86. env=env,
  87. timeout=30
  88. )
  89. assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
  90. # Second run should mention cache reuse
  91. assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
  92. def test_no_configuration_required():
  93. """Test that uBlock Origin works without configuration"""
  94. with tempfile.TemporaryDirectory() as tmpdir:
  95. ext_dir = Path(tmpdir) / "chrome_extensions"
  96. ext_dir.mkdir(parents=True)
  97. env = os.environ.copy()
  98. env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
  99. # No API keys needed - works with default filter lists
  100. result = subprocess.run(
  101. ["node", str(INSTALL_SCRIPT)],
  102. capture_output=True,
  103. text=True,
  104. env=env,
  105. timeout=120
  106. )
  107. # Should not require any API keys
  108. combined_output = result.stdout + result.stderr
  109. assert "API" not in combined_output or result.returncode == 0
  110. def test_large_extension_size():
  111. """Test that uBlock Origin is downloaded successfully despite large size"""
  112. with tempfile.TemporaryDirectory() as tmpdir:
  113. ext_dir = Path(tmpdir) / "chrome_extensions"
  114. ext_dir.mkdir(parents=True)
  115. env = os.environ.copy()
  116. env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
  117. result = subprocess.run(
  118. ["node", str(INSTALL_SCRIPT)],
  119. capture_output=True,
  120. text=True,
  121. env=env,
  122. timeout=120
  123. )
  124. # If extension was downloaded, verify it's substantial size
  125. crx_file = ext_dir / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock.crx"
  126. if crx_file.exists():
  127. # uBlock Origin with filter lists is typically 2-5 MB
  128. size_bytes = crx_file.stat().st_size
  129. assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
  130. def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
  131. """Check ad blocking effectiveness by counting ad elements on page.
  132. Returns dict with:
  133. - adElementsFound: int - number of ad-related elements found
  134. - adElementsVisible: int - number of visible ad elements
  135. - blockedRequests: int - number of blocked network requests (ads/trackers)
  136. - totalRequests: int - total network requests made
  137. - percentBlocked: int - percentage of ad elements hidden (0-100)
  138. """
  139. test_script = f'''
  140. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  141. const puppeteer = require('puppeteer-core');
  142. (async () => {{
  143. const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
  144. const page = await browser.newPage();
  145. await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
  146. await page.setViewport({{ width: 1440, height: 900 }});
  147. // Track network requests
  148. let blockedRequests = 0;
  149. let totalRequests = 0;
  150. const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
  151. 'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
  152. 'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];
  153. page.on('request', request => {{
  154. totalRequests++;
  155. const url = request.url().toLowerCase();
  156. if (adDomains.some(d => url.includes(d))) {{
  157. // This is an ad request
  158. }}
  159. }});
  160. page.on('requestfailed', request => {{
  161. const url = request.url().toLowerCase();
  162. if (adDomains.some(d => url.includes(d))) {{
  163. blockedRequests++;
  164. }}
  165. }});
  166. console.error('Navigating to {test_url}...');
  167. await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});
  168. // Wait for page to fully render and ads to load
  169. await new Promise(r => setTimeout(r, 5000));
  170. // Check for ad elements in the DOM
  171. const result = await page.evaluate(() => {{
  172. // Common ad-related selectors
  173. const adSelectors = [
  174. // Generic ad containers
  175. '[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
  176. '[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
  177. '[class*="advertisement"]', '[id*="advertisement"]',
  178. '[class*="sponsored"]', '[id*="sponsored"]',
  179. // Google ads
  180. 'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
  181. // Yahoo specific
  182. '[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
  183. '[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
  184. // iframes (often ads)
  185. 'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
  186. // Common ad sizes
  187. '[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
  188. '[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
  189. ];
  190. let adElementsFound = 0;
  191. let adElementsVisible = 0;
  192. for (const selector of adSelectors) {{
  193. try {{
  194. const elements = document.querySelectorAll(selector);
  195. for (const el of elements) {{
  196. adElementsFound++;
  197. const style = window.getComputedStyle(el);
  198. const rect = el.getBoundingClientRect();
  199. const isVisible = style.display !== 'none' &&
  200. style.visibility !== 'hidden' &&
  201. style.opacity !== '0' &&
  202. rect.width > 0 && rect.height > 0;
  203. if (isVisible) {{
  204. adElementsVisible++;
  205. }}
  206. }}
  207. }} catch (e) {{
  208. // Invalid selector, skip
  209. }}
  210. }}
  211. return {{
  212. adElementsFound,
  213. adElementsVisible,
  214. pageTitle: document.title
  215. }};
  216. }});
  217. result.blockedRequests = blockedRequests;
  218. result.totalRequests = totalRequests;
  219. // Calculate how many ad elements were hidden (found but not visible)
  220. const hiddenAds = result.adElementsFound - result.adElementsVisible;
  221. result.percentBlocked = result.adElementsFound > 0
  222. ? Math.round((hiddenAds / result.adElementsFound) * 100)
  223. : 0;
  224. console.error('Ad blocking result:', JSON.stringify(result));
  225. browser.disconnect();
  226. console.log(JSON.stringify(result));
  227. }})();
  228. '''
  229. script_path = script_dir / 'check_ads.js'
  230. script_path.write_text(test_script)
  231. result = subprocess.run(
  232. ['node', str(script_path)],
  233. cwd=str(script_dir),
  234. capture_output=True,
  235. text=True,
  236. env=env,
  237. timeout=90
  238. )
  239. if result.returncode != 0:
  240. raise RuntimeError(f"Ad check script failed: {result.stderr}")
  241. output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
  242. if not output_lines:
  243. raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")
  244. return json.loads(output_lines[-1])
  245. # Test URL: Yahoo has many ads that uBlock should block (no mocks)
  246. TEST_URL = 'https://www.yahoo.com/'
  247. def test_extension_loads_in_chromium():
  248. """Verify uBlock extension loads in Chromium by visiting its dashboard page.
  249. Uses Chromium with --load-extension to load the extension, then navigates
  250. to chrome-extension://<id>/dashboard.html and checks that "uBlock" appears
  251. in the page content.
  252. """
  253. import signal
  254. import time
  255. print("[test] Starting test_extension_loads_in_chromium", flush=True)
  256. with tempfile.TemporaryDirectory() as tmpdir:
  257. tmpdir = Path(tmpdir)
  258. print(f"[test] tmpdir={tmpdir}", flush=True)
  259. # Set up isolated env with proper directory structure
  260. env = setup_test_env(tmpdir)
  261. env.setdefault('CHROME_HEADLESS', 'true')
  262. print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True)
  263. print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True)
  264. ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
  265. # Step 1: Install the uBlock extension
  266. print("[test] Installing uBlock extension...", flush=True)
  267. result = subprocess.run(
  268. ['node', str(INSTALL_SCRIPT)],
  269. capture_output=True,
  270. text=True,
  271. env=env,
  272. timeout=5
  273. )
  274. print(f"[test] Extension install rc={result.returncode}", flush=True)
  275. assert result.returncode == 0, f"Extension install failed: {result.stderr}"
  276. # Verify extension cache was created
  277. cache_file = ext_dir / 'ublock.extension.json'
  278. assert cache_file.exists(), "Extension cache not created"
  279. ext_data = json.loads(cache_file.read_text())
  280. print(f"[test] Extension installed: {ext_data.get('name')} v{ext_data.get('version')}", flush=True)
  281. # Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
  282. print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
  283. print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
  284. print("[test] Launching Chromium...", flush=True)
  285. # Launch Chromium in crawls directory
  286. crawl_id = 'test-ublock'
  287. crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
  288. crawl_dir.mkdir(parents=True, exist_ok=True)
  289. chrome_dir = crawl_dir / 'chrome'
  290. chrome_dir.mkdir(parents=True, exist_ok=True)
  291. env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
  292. chrome_launch_process = subprocess.Popen(
  293. ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
  294. cwd=str(chrome_dir),
  295. stdout=subprocess.PIPE,
  296. stderr=subprocess.PIPE,
  297. text=True,
  298. env=env
  299. )
  300. print("[test] Chrome hook started, waiting for CDP...", flush=True)
  301. # Wait for Chromium to launch and CDP URL to be available
  302. cdp_url = None
  303. import select
  304. for i in range(20):
  305. poll_result = chrome_launch_process.poll()
  306. if poll_result is not None:
  307. stdout, stderr = chrome_launch_process.communicate()
  308. raise RuntimeError(f"Chromium launch failed (exit={poll_result}):\nStdout: {stdout}\nStderr: {stderr}")
  309. cdp_file = chrome_dir / 'cdp_url.txt'
  310. if cdp_file.exists():
  311. cdp_url = cdp_file.read_text().strip()
  312. print(f"[test] CDP URL found after {i+1} attempts", flush=True)
  313. break
  314. # Read any available stderr
  315. while select.select([chrome_launch_process.stderr], [], [], 0)[0]:
  316. line = chrome_launch_process.stderr.readline()
  317. if not line:
  318. break
  319. print(f"[hook] {line.strip()}", flush=True)
  320. time.sleep(0.3)
  321. assert cdp_url, "Chromium CDP URL not found after 20s"
  322. print(f"[test] Chromium launched with CDP URL: {cdp_url}", flush=True)
  323. print("[test] Reading hook stderr...", flush=True)
  324. # Check what extensions were loaded by chrome hook
  325. extensions_file = chrome_dir / 'extensions.json'
  326. if extensions_file.exists():
  327. loaded_exts = json.loads(extensions_file.read_text())
  328. print(f"Extensions loaded by chrome hook: {[e.get('name') for e in loaded_exts]}")
  329. else:
  330. print("Warning: extensions.json not found")
  331. # Get the unpacked extension ID - Chrome computes this from the path
  332. unpacked_path = ext_data.get('unpacked_path', '')
  333. print(f"[test] Extension unpacked path: {unpacked_path}", flush=True)
  334. print("[test] Running puppeteer test script...", flush=True)
  335. try:
  336. # Step 3: Connect to Chromium and verify extension loads
  337. # First use CDP to get all targets and find extension ID
  338. test_script = f'''
  339. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  340. const puppeteer = require('puppeteer-core');
  341. (async () => {{
  342. const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
  343. // Wait for extension to initialize
  344. await new Promise(r => setTimeout(r, 500));
  345. // Use CDP to get all targets including service workers
  346. const pages = await browser.pages();
  347. const page = pages[0] || await browser.newPage();
  348. const client = await page.createCDPSession();
  349. const {{ targetInfos }} = await client.send('Target.getTargets');
  350. console.error('All CDP targets:');
  351. targetInfos.forEach(t => console.error(' -', t.type, t.url.slice(0, 100)));
  352. // Find any chrome-extension:// URLs
  353. const extTargets = targetInfos.filter(t => t.url.startsWith('chrome-extension://'));
  354. console.error('Extension targets:', extTargets.length);
  355. // Filter out built-in extensions
  356. const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf',
  357. 'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai'];
  358. const customExts = extTargets.filter(t => {{
  359. const extId = t.url.split('://')[1].split('/')[0];
  360. return !builtinIds.includes(extId);
  361. }});
  362. if (customExts.length === 0) {{
  363. console.log(JSON.stringify({{ loaded: false, error: 'No custom extension found via CDP' }}));
  364. browser.disconnect();
  365. return;
  366. }}
  367. // Get extension ID from first custom extension
  368. const extId = customExts[0].url.split('://')[1].split('/')[0];
  369. console.error('Found extension ID:', extId);
  370. // Try to load dashboard.html
  371. const newPage = await browser.newPage();
  372. const dashboardUrl = 'chrome-extension://' + extId + '/dashboard.html';
  373. console.error('Loading:', dashboardUrl);
  374. try {{
  375. await newPage.goto(dashboardUrl, {{ waitUntil: 'domcontentloaded', timeout: 15000 }});
  376. const title = await newPage.title();
  377. const content = await newPage.content();
  378. const hasUblock = content.toLowerCase().includes('ublock') || title.toLowerCase().includes('ublock');
  379. console.log(JSON.stringify({{
  380. loaded: true,
  381. extensionId: extId,
  382. pageTitle: title,
  383. hasExtensionName: hasUblock,
  384. contentLength: content.length
  385. }}));
  386. }} catch (e) {{
  387. console.error('Dashboard load failed:', e.message);
  388. console.log(JSON.stringify({{ loaded: true, extensionId: extId, dashboardError: e.message }}));
  389. }}
  390. browser.disconnect();
  391. }})();
  392. '''
  393. script_path = tmpdir / 'test_ublock.js'
  394. script_path.write_text(test_script)
  395. result = subprocess.run(
  396. ['node', str(script_path)],
  397. cwd=str(tmpdir),
  398. capture_output=True,
  399. text=True,
  400. env=env,
  401. timeout=10
  402. )
  403. print(f"stderr: {result.stderr}")
  404. print(f"stdout: {result.stdout}")
  405. assert result.returncode == 0, f"Test failed: {result.stderr}"
  406. output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
  407. assert output_lines, f"No JSON output: {result.stdout}"
  408. test_result = json.loads(output_lines[-1])
  409. assert test_result.get('loaded'), \
  410. f"uBlock extension should be loaded in Chromium. Result: {test_result}"
  411. print(f"Extension loaded successfully: {test_result}")
  412. finally:
  413. # Clean up Chromium
  414. try:
  415. chrome_launch_process.send_signal(signal.SIGTERM)
  416. chrome_launch_process.wait(timeout=5)
  417. except:
  418. pass
  419. chrome_pid_file = chrome_dir / 'chrome.pid'
  420. if chrome_pid_file.exists():
  421. try:
  422. chrome_pid = int(chrome_pid_file.read_text().strip())
  423. os.kill(chrome_pid, signal.SIGKILL)
  424. except (OSError, ValueError):
  425. pass
  426. def test_blocks_ads_on_yahoo_com():
  427. """Live test: verify uBlock Origin blocks ads on yahoo.com (real network).
  428. This test runs TWO browser sessions:
  429. 1. WITHOUT extension - verifies ads are NOT blocked (baseline)
  430. 2. WITH extension - verifies ads ARE blocked
  431. This ensures we're actually testing the extension's effect, not just
  432. that a test page happens to show ads as blocked. No mocks are used.
  433. """
  434. import time
  435. with tempfile.TemporaryDirectory() as tmpdir:
  436. tmpdir = Path(tmpdir)
  437. # Set up isolated env with proper directory structure
  438. env_base = setup_test_env(tmpdir)
  439. env_base['CHROME_HEADLESS'] = 'true'
  440. # ============================================================
  441. # STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
  442. # ============================================================
  443. print("\n" + "="*60)
  444. print("STEP 1: BASELINE TEST (no extension)")
  445. print("="*60)
  446. data_dir = Path(env_base['DATA_DIR'])
  447. env_no_ext = env_base.copy()
  448. env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
  449. (data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
  450. # Launch baseline Chromium in crawls directory
  451. baseline_crawl_id = 'baseline-no-ext'
  452. baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
  453. baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
  454. baseline_chrome_dir = baseline_crawl_dir / 'chrome'
  455. env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
  456. baseline_process = None
  457. try:
  458. baseline_process, baseline_cdp_url = launch_chromium_session(
  459. env_no_ext, baseline_chrome_dir, baseline_crawl_id
  460. )
  461. print(f"Baseline Chromium launched: {baseline_cdp_url}")
  462. # Wait a moment for browser to be ready
  463. time.sleep(2)
  464. baseline_result = check_ad_blocking(
  465. baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
  466. )
  467. print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
  468. f"(found {baseline_result['adElementsFound']} ad elements)")
  469. finally:
  470. if baseline_process:
  471. kill_chromium_session(baseline_process, baseline_chrome_dir)
  472. # Verify baseline shows ads ARE visible (not blocked)
  473. if baseline_result['adElementsFound'] == 0:
  474. pytest.fail(
  475. f"Baseline must find ad elements on {TEST_URL}, but found none. "
  476. f"This test requires a real ad-heavy page."
  477. )
  478. if baseline_result['adElementsVisible'] == 0:
  479. pytest.fail(
  480. f"Baseline must have visible ads on {TEST_URL}, but none were visible. "
  481. f"This likely means another ad blocker is active or network-level blocking is in effect."
  482. )
  483. print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
  484. # ============================================================
  485. # STEP 2: Install the uBlock extension
  486. # ============================================================
  487. print("\n" + "="*60)
  488. print("STEP 2: INSTALLING EXTENSION")
  489. print("="*60)
  490. ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
  491. result = subprocess.run(
  492. ['node', str(INSTALL_SCRIPT)],
  493. capture_output=True,
  494. text=True,
  495. env=env_base,
  496. timeout=60
  497. )
  498. assert result.returncode == 0, f"Extension install failed: {result.stderr}"
  499. cache_file = ext_dir / 'ublock.extension.json'
  500. assert cache_file.exists(), "Extension cache not created"
  501. ext_data = json.loads(cache_file.read_text())
  502. print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
  503. # ============================================================
  504. # STEP 3: Run WITH extension, verify ads ARE blocked
  505. # ============================================================
  506. print("\n" + "="*60)
  507. print("STEP 3: TEST WITH EXTENSION")
  508. print("="*60)
  509. # Launch extension test Chromium in crawls directory
  510. ext_crawl_id = 'test-with-ext'
  511. ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
  512. ext_crawl_dir.mkdir(parents=True, exist_ok=True)
  513. ext_chrome_dir = ext_crawl_dir / 'chrome'
  514. env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
  515. ext_process = None
  516. try:
  517. ext_process, ext_cdp_url = launch_chromium_session(
  518. env_base, ext_chrome_dir, ext_crawl_id
  519. )
  520. print(f"Extension Chromium launched: {ext_cdp_url}")
  521. # Check that extension was loaded
  522. extensions_file = ext_chrome_dir / 'extensions.json'
  523. if extensions_file.exists():
  524. loaded_exts = json.loads(extensions_file.read_text())
  525. print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
  526. # Verify extension has ID and is initialized
  527. if loaded_exts and loaded_exts[0].get('id'):
  528. ext_id = loaded_exts[0]['id']
  529. print(f"Extension ID: {ext_id}")
  530. # Visit the extension dashboard to ensure it's fully loaded
  531. print("Visiting extension dashboard to verify initialization...")
  532. dashboard_script = f'''
  533. const puppeteer = require('{env_base['NODE_MODULES_DIR']}/puppeteer-core');
  534. (async () => {{
  535. const browser = await puppeteer.connect({{
  536. browserWSEndpoint: '{ext_cdp_url}',
  537. defaultViewport: null
  538. }});
  539. const page = await browser.newPage();
  540. await page.goto('chrome-extension://{ext_id}/dashboard.html', {{ waitUntil: 'domcontentloaded', timeout: 10000 }});
  541. const title = await page.title();
  542. console.log('Dashboard title:', title);
  543. await page.close();
  544. browser.disconnect();
  545. }})();
  546. '''
  547. dash_script_path = tmpdir / 'check_dashboard.js'
  548. dash_script_path.write_text(dashboard_script)
  549. subprocess.run(['node', str(dash_script_path)], capture_output=True, timeout=15, env=env_base)
  550. # Wait longer for extension to fully initialize filters
  551. # On first run, uBlock needs to download filter lists which can take 10-15 seconds
  552. print("Waiting for uBlock filter lists to download and initialize...")
  553. time.sleep(15)
  554. ext_result = check_ad_blocking(
  555. ext_cdp_url, TEST_URL, env_base, tmpdir
  556. )
  557. print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
  558. f"(found {ext_result['adElementsFound']} ad elements)")
  559. finally:
  560. if ext_process:
  561. kill_chromium_session(ext_process, ext_chrome_dir)
  562. # ============================================================
  563. # STEP 4: Compare results
  564. # ============================================================
  565. print("\n" + "="*60)
  566. print("STEP 4: COMPARISON")
  567. print("="*60)
  568. print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
  569. print(f"With extension: {ext_result['adElementsVisible']} visible ads")
  570. # Calculate reduction in visible ads
  571. ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
  572. reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
  573. print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
  574. # Extension should significantly reduce visible ads
  575. assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
  576. f"uBlock should reduce visible ads.\n" \
  577. f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
  578. f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
  579. f"Expected fewer ads with extension."
  580. # Ensure uBlock actually blocks at least some ad/track requests
  581. assert ext_result['blockedRequests'] > 0, \
  582. "uBlock should block at least one ad/track request on yahoo.com"
  583. # Extension should block at least 20% of ads (was consistently blocking 5-13% without proper init time)
  584. assert reduction_percent >= 20, \
  585. f"uBlock should block at least 20% of ads.\n" \
  586. f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
  587. f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
  588. f"Reduction: only {reduction_percent:.0f}% (expected at least 20%)\n" \
  589. f"Note: Filter lists must be downloaded on first run (takes ~15s)"
  590. print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
  591. print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
  592. print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
  593. print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")