# test_twocaptcha.py
  1. """
  2. Integration tests for twocaptcha plugin
  3. Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs
  4. NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium.
  5. """
  6. import json
  7. import os
  8. import signal
  9. import subprocess
  10. import tempfile
  11. import time
  12. from pathlib import Path
  13. import pytest
  14. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  15. setup_test_env,
  16. launch_chromium_session,
  17. kill_chromium_session,
  18. CHROME_LAUNCH_HOOK,
  19. PLUGINS_ROOT,
  20. )
  21. PLUGIN_DIR = Path(__file__).parent.parent
  22. INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__83_twocaptcha_install.js'
  23. CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__95_twocaptcha_config.js'
  24. TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile'
  25. # Alias for backward compatibility with existing test names
  26. launch_chrome = launch_chromium_session
  27. kill_chrome = kill_chromium_session
  28. class TestTwoCaptcha:
  29. """Integration tests requiring TWOCAPTCHA_API_KEY."""
  30. @pytest.fixture(autouse=True)
  31. def setup(self):
  32. self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
  33. if not self.api_key:
  34. pytest.fail("TWOCAPTCHA_API_KEY required")
  35. def test_install_and_load(self):
  36. """Extension installs and loads in Chromium."""
  37. with tempfile.TemporaryDirectory() as tmpdir:
  38. tmpdir = Path(tmpdir)
  39. env = setup_test_env(tmpdir)
  40. env['TWOCAPTCHA_API_KEY'] = self.api_key
  41. # Install
  42. result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
  43. assert result.returncode == 0, f"Install failed: {result.stderr}"
  44. cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json'
  45. assert cache.exists()
  46. data = json.loads(cache.read_text())
  47. assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo'
  48. # Launch Chromium in crawls directory
  49. crawl_id = 'test'
  50. crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
  51. chrome_dir = crawl_dir / 'chrome'
  52. env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
  53. process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
  54. try:
  55. # Wait for extensions.json to be written
  56. extensions_file = chrome_dir / 'extensions.json'
  57. for i in range(20):
  58. if extensions_file.exists():
  59. break
  60. time.sleep(0.5)
  61. assert extensions_file.exists(), f"extensions.json not created. Chrome dir files: {list(chrome_dir.iterdir())}"
  62. exts = json.loads(extensions_file.read_text())
  63. assert any(e['name'] == 'twocaptcha' for e in exts), f"twocaptcha not loaded: {exts}"
  64. print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
  65. finally:
  66. kill_chrome(process, chrome_dir)
  67. def test_config_applied(self):
  68. """Configuration is applied to extension and verified via Config.getAll()."""
  69. with tempfile.TemporaryDirectory() as tmpdir:
  70. tmpdir = Path(tmpdir)
  71. env = setup_test_env(tmpdir)
  72. env['TWOCAPTCHA_API_KEY'] = self.api_key
  73. env['TWOCAPTCHA_RETRY_COUNT'] = '5'
  74. env['TWOCAPTCHA_RETRY_DELAY'] = '10'
  75. subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)
  76. # Launch Chromium in crawls directory
  77. crawl_id = 'cfg'
  78. crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
  79. chrome_dir = crawl_dir / 'chrome'
  80. env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
  81. process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
  82. try:
  83. # Wait for extensions.json to be written
  84. extensions_file = chrome_dir / 'extensions.json'
  85. for i in range(20):
  86. if extensions_file.exists():
  87. break
  88. time.sleep(0.5)
  89. assert extensions_file.exists(), f"extensions.json not created"
  90. result = subprocess.run(
  91. ['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
  92. env=env, timeout=30, capture_output=True, text=True
  93. )
  94. assert result.returncode == 0, f"Config failed: {result.stderr}"
  95. assert (chrome_dir / '.twocaptcha_configured').exists()
  96. # Verify config via options.html and Config.getAll()
  97. # Get the actual extension ID from the config marker (Chrome computes IDs differently)
  98. config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text())
  99. ext_id = config_marker['extensionId']
  100. script = f'''
  101. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  102. const puppeteer = require('puppeteer-core');
  103. (async () => {{
  104. const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
  105. // Load options.html and use Config.getAll() to verify
  106. const optionsUrl = 'chrome-extension://{ext_id}/options/options.html';
  107. const page = await browser.newPage();
  108. console.error('[*] Loading options page:', optionsUrl);
  109. // Navigate - catch error but continue since page may still load
  110. try {{
  111. await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }});
  112. }} catch (e) {{
  113. console.error('[*] Navigation threw error (may still work):', e.message);
  114. }}
  115. // Wait for page to settle
  116. await new Promise(r => setTimeout(r, 2000));
  117. console.error('[*] Current URL:', page.url());
  118. // Wait for Config object to be available
  119. await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }});
  120. // Call Config.getAll() - the extension's own API (returns a Promise)
  121. const cfg = await page.evaluate(async () => await Config.getAll());
  122. console.error('[*] Config.getAll() returned:', JSON.stringify(cfg));
  123. await page.close();
  124. browser.disconnect();
  125. console.log(JSON.stringify(cfg));
  126. }})();
  127. '''
  128. (tmpdir / 'v.js').write_text(script)
  129. r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True)
  130. print(r.stderr)
  131. assert r.returncode == 0, f"Verify failed: {r.stderr}"
  132. cfg = json.loads(r.stdout.strip().split('\n')[-1])
  133. print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}")
  134. # Verify all the fields we care about
  135. assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}"
  136. assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}"
  137. assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}"
  138. assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}"
  139. assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}"
  140. assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}"
  141. assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}"
  142. assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}"
  143. print(f"[+] Config verified via Config.getAll()!")
  144. finally:
  145. kill_chrome(process, chrome_dir)
  146. def test_solves_recaptcha(self):
  147. """Extension attempts to solve CAPTCHA on demo page.
  148. CRITICAL: DO NOT SKIP OR DISABLE THIS TEST EVEN IF IT'S FLAKY!
  149. This test is INTENTIONALLY left enabled to expose the REAL, ACTUAL flakiness
  150. of the 2captcha service and demo page. The test failures you see here are NOT
  151. test bugs - they are ACCURATE representations of the real-world reliability
  152. of this CAPTCHA solving service.
  153. If this test is flaky, that's because 2captcha IS FLAKY in production.
  154. If this test fails intermittently, that's because 2captcha FAILS INTERMITTENTLY in production.
  155. NEVER EVER hide real flakiness by disabling tests or adding @pytest.mark.skip.
  156. Users NEED to see this failure rate to understand what they're getting into.
  157. When this test DOES pass, it confirms:
  158. - Extension loads and configures correctly
  159. - 2captcha API key is accepted
  160. - Extension can successfully auto-solve CAPTCHAs
  161. - The entire flow works end-to-end
  162. When it fails (as it often does):
  163. - Demo page has JavaScript errors (representing real-world broken sites)
  164. - Turnstile tokens expire before solving (representing real-world timing issues)
  165. - 2captcha service may be slow/down (representing real-world service issues)
  166. This is VALUABLE INFORMATION about the service. DO NOT HIDE IT.
  167. """
  168. with tempfile.TemporaryDirectory() as tmpdir:
  169. tmpdir = Path(tmpdir)
  170. env = setup_test_env(tmpdir)
  171. env['TWOCAPTCHA_API_KEY'] = self.api_key
  172. subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)
  173. # Launch Chromium in crawls directory
  174. crawl_id = 'solve'
  175. crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
  176. chrome_dir = crawl_dir / 'chrome'
  177. env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
  178. process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
  179. try:
  180. # Wait for extensions.json to be written
  181. extensions_file = chrome_dir / 'extensions.json'
  182. for i in range(20):
  183. if extensions_file.exists():
  184. break
  185. time.sleep(0.5)
  186. assert extensions_file.exists(), f"extensions.json not created"
  187. subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True)
  188. script = f'''
  189. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  190. const puppeteer = require('puppeteer-core');
  191. (async () => {{
  192. const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
  193. const page = await browser.newPage();
  194. // Capture console messages from the page (including extension messages)
  195. page.on('console', msg => {{
  196. const text = msg.text();
  197. if (text.includes('2captcha') || text.includes('turnstile') || text.includes('captcha')) {{
  198. console.error('[CONSOLE]', text);
  199. }}
  200. }});
  201. await page.setViewport({{ width: 1440, height: 900 }});
  202. console.error('[*] Loading {TEST_URL}...');
  203. await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
  204. // Wait for CAPTCHA iframe (minimal wait to avoid token expiration)
  205. console.error('[*] Waiting for CAPTCHA iframe...');
  206. await page.waitForSelector('iframe', {{ timeout: 30000 }});
  207. console.error('[*] CAPTCHA iframe found - extension should auto-solve now');
  208. // DON'T CLICK - extension should auto-solve since autoSolveTurnstile=True
  209. console.error('[*] Waiting for auto-solve (extension configured with autoSolveTurnstile=True)...');
  210. // Poll for data-state changes with debug output
  211. console.error('[*] Waiting for CAPTCHA to be solved (up to 150s)...');
  212. const start = Date.now();
  213. let solved = false;
  214. let lastState = null;
  215. while (!solved && (Date.now() - start) < 150000) {{
  216. const state = await page.evaluate(() => {{
  217. const solver = document.querySelector('.captcha-solver');
  218. return {{
  219. state: solver?.getAttribute('data-state'),
  220. text: solver?.textContent?.trim(),
  221. classList: solver?.className
  222. }};
  223. }});
  224. if (state.state !== lastState) {{
  225. const elapsed = Math.round((Date.now() - start) / 1000);
  226. console.error(`[*] State change at ${{elapsed}}s: "${{lastState}}" -> "${{state.state}}" (text: "${{state.text?.slice(0, 50)}}")`);
  227. lastState = state.state;
  228. }}
  229. if (state.state === 'solved') {{
  230. solved = true;
  231. const elapsed = Math.round((Date.now() - start) / 1000);
  232. console.error('[+] SOLVED in ' + elapsed + 's!');
  233. break;
  234. }}
  235. // Check every 2 seconds
  236. await new Promise(r => setTimeout(r, 2000));
  237. }}
  238. if (!solved) {{
  239. const elapsed = Math.round((Date.now() - start) / 1000);
  240. const finalState = await page.evaluate(() => {{
  241. const solver = document.querySelector('.captcha-solver');
  242. return {{
  243. state: solver?.getAttribute('data-state'),
  244. text: solver?.textContent?.trim(),
  245. html: solver?.outerHTML?.slice(0, 200)
  246. }};
  247. }});
  248. console.error(`[!] TIMEOUT after ${{elapsed}}s. Final state: ${{JSON.stringify(finalState)}}`);
  249. browser.disconnect();
  250. process.exit(1);
  251. }}
  252. const final = await page.evaluate(() => {{
  253. const solver = document.querySelector('.captcha-solver');
  254. return {{
  255. solved: true,
  256. state: solver?.getAttribute('data-state'),
  257. text: solver?.textContent?.trim()
  258. }};
  259. }});
  260. browser.disconnect();
  261. console.log(JSON.stringify(final));
  262. }})();
  263. '''
  264. (tmpdir / 's.js').write_text(script)
  265. print("\n[*] Solving CAPTCHA (this can take up to 150s for 2captcha API)...")
  266. r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=200, capture_output=True, text=True)
  267. print(r.stderr)
  268. assert r.returncode == 0, f"Failed: {r.stderr}"
  269. final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
  270. assert final.get('solved'), f"Not solved: {final}"
  271. assert final.get('state') == 'solved', f"State not 'solved': {final}"
  272. print(f"[+] SUCCESS! CAPTCHA solved: {final.get('text','')[:50]}")
  273. finally:
  274. kill_chrome(process, chrome_dir)
  275. if __name__ == '__main__':
  276. pytest.main([__file__, '-xvs'])