Browse Source

Derive CHROME_USER_DATA_DIR from ACTIVE_PERSONA in config system

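- Add _derive_persona_paths() in configset.py to automatically derive
  CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from ACTIVE_PERSONA
  when not explicitly set (plus COOKIES_FILE, if the persona has a
  cookies.txt). ACTIVE_PERSONA itself falls back to DEFAULT_PERSONA,
  then "Default". This allows plugins to use these paths without
  knowing about the persona system.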

- Update chrome_utils.js launchChromium() to accept a userDataDir option
  (defaulting to the CHROME_USER_DATA_DIR env var) and pass
  --user-data-dir to Chrome. It also creates the directory if missing
  and removes any stale SingletonLock before launch.

- Update killZombieChrome() to clean up SingletonLock files from all
  persona chrome_user_data directories after killing zombies.

- Update chrome_cleanup() in misc/util.py to handle persona-based
  user data directories when cleaning up stale Chrome state.

- Simplify on_Crawl__20_chrome_launch.bg.js to use CHROME_USER_DATA_DIR
  and CHROME_EXTENSIONS_DIR from env (derived by get_config()).

Config derivation flow (explicitly set values always take precedence):
  ACTIVE_PERSONA=WorkAccount (set on crawl/snapshot)
  -> get_config() derives:
     CHROME_USER_DATA_DIR = PERSONAS_DIR/WorkAccount/chrome_user_data
     CHROME_EXTENSIONS_DIR = PERSONAS_DIR/WorkAccount/chrome_extensions
     COOKIES_FILE = PERSONAS_DIR/WorkAccount/cookies.txt (only if the file exists)
  -> hooks receive these as env vars without needing persona logic
Claude 1 month ago
parent
commit
877b5f91c2

+ 46 - 0
archivebox/config/configset.py

@@ -240,6 +240,52 @@ def get_config(
     except ImportError:
         pass
 
+    # Derive persona-based paths if not explicitly set
+    # This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas
+    config = _derive_persona_paths(config, CONSTANTS)
+
+    return config
+
+
+def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]:
+    """
+    Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set.
+
+    Mutates `config` in place and returns it. get_config() calls this just before
+    returning, so plugins get resolved paths without knowing about personas.
+
+    Derived paths (a missing, None or empty value counts as "not set"):
+        CHROME_USER_DATA_DIR  <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data
+        CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions
+        COOKIES_FILE          <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists)
+    """
+    # Resolve the persona name: ACTIVE_PERSONA, else DEFAULT_PERSONA, else "Default"
+    active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default'
+
+    # Write the resolved name back so downstream code always sees ACTIVE_PERSONA
+    config['ACTIVE_PERSONA'] = active_persona
+
+    # PERSONAS_DIR must support "/" and .exists() (presumably a pathlib.Path - confirm)
+    personas_dir = CONSTANTS.PERSONAS_DIR
+    persona_dir = personas_dir / active_persona
+
+    # Derive CHROME_USER_DATA_DIR unless already set; the derived value is stored as str
+    chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
+    if not chrome_user_data_dir:
+        config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data')
+
+    # Derive CHROME_EXTENSIONS_DIR unless already set; the derived value is stored as str
+    chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR')
+    if not chrome_extensions_dir:
+        config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions')
+
+    # Derive COOKIES_FILE only when unset AND the persona's cookies.txt exists on disk
+    cookies_file = config.get('COOKIES_FILE')
+    if not cookies_file:
+        persona_cookies = persona_dir / 'cookies.txt'
+        if persona_cookies.exists():
+            config['COOKIES_FILE'] = str(persona_cookies)
+
     return config
 
 

+ 41 - 3
archivebox/misc/util.py

@@ -480,12 +480,50 @@ for url_str, num_urls in _test_url_strs.items():
 
 def chrome_cleanup():
     """
-    Cleans up any state or runtime files that chrome leaves behind when killed by
-    a timeout or other error
+    Cleans up any state or runtime files that Chrome leaves behind when killed by
+    a timeout or other error. Handles:
+    - Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
+    - Explicit CHROME_USER_DATA_DIR
+    - Legacy Docker chromium path
     """
     import os
+    from pathlib import Path
     from archivebox.config.permissions import IN_DOCKER
-    
+
+    # Clean up persona-based user data directories
+    try:
+        from archivebox.config.configset import get_config
+        from archivebox.config.constants import CONSTANTS
+
+        config = get_config()
+
+        # Clean up the active persona's chrome_user_data SingletonLock
+        chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
+        if chrome_user_data_dir:
+            singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
+            if singleton_lock.exists():
+                try:
+                    singleton_lock.unlink()
+                except OSError:
+                    pass
+
+        # Clean up all persona directories
+        personas_dir = CONSTANTS.PERSONAS_DIR
+        if personas_dir.exists():
+            for persona_dir in personas_dir.iterdir():
+                if not persona_dir.is_dir():
+                    continue
+                user_data_dir = persona_dir / 'chrome_user_data'
+                singleton_lock = user_data_dir / 'SingletonLock'
+                if singleton_lock.exists():
+                    try:
+                        singleton_lock.unlink()
+                    except OSError:
+                        pass
+    except Exception:
+        pass  # Config not available during early startup
+
+    # Legacy Docker cleanup
     if IN_DOCKER:
         singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
         if os.path.lexists(singleton_lock):

+ 46 - 0
archivebox/plugins/chrome/chrome_utils.js

@@ -257,6 +257,31 @@ function killZombieChrome(dataDir = null) {
         console.error('[+] No zombies found');
     }
 
+    // Clean up stale SingletonLock files from persona chrome_user_data directories
+    const personasDir = path.join(dataDir, 'personas');
+    if (fs.existsSync(personasDir)) {
+        try {
+            const personas = fs.readdirSync(personasDir, { withFileTypes: true });
+            for (const persona of personas) {
+                if (!persona.isDirectory()) continue;
+
+                const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data');
+                const singletonLock = path.join(userDataDir, 'SingletonLock');
+
+                if (fs.existsSync(singletonLock)) {
+                    try {
+                        fs.unlinkSync(singletonLock);
+                        console.error(`[+] Removed stale SingletonLock: ${singletonLock}`);
+                    } catch (e) {
+                        // Ignore - may be in use by active Chrome
+                    }
+                }
+            }
+        } catch (e) {
+            // Ignore errors scanning personas directory
+        }
+    }
+
     return killed;
 }
 
@@ -270,6 +295,7 @@ function killZombieChrome(dataDir = null) {
  * @param {Object} options - Launch options
  * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
  * @param {string} [options.outputDir='chrome'] - Directory for output files
+ * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions
  * @param {string} [options.resolution='1440,2000'] - Window resolution
  * @param {boolean} [options.headless=true] - Run in headless mode
  * @param {boolean} [options.checkSsl=true] - Check SSL certificates
@@ -281,6 +307,7 @@ async function launchChromium(options = {}) {
     const {
         binary = findChromium(),
         outputDir = 'chrome',
+        userDataDir = getEnv('CHROME_USER_DATA_DIR'),
         resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
         headless = getEnvBool('CHROME_HEADLESS', true),
         checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
@@ -304,6 +331,24 @@ async function launchChromium(options = {}) {
         fs.mkdirSync(outputDir, { recursive: true });
     }
 
+    // Create user data directory if specified and doesn't exist
+    if (userDataDir) {
+        if (!fs.existsSync(userDataDir)) {
+            fs.mkdirSync(userDataDir, { recursive: true });
+            console.error(`[*] Created user data directory: ${userDataDir}`);
+        }
+        // Clean up any stale SingletonLock file from previous crashed sessions
+        const singletonLock = path.join(userDataDir, 'SingletonLock');
+        if (fs.existsSync(singletonLock)) {
+            try {
+                fs.unlinkSync(singletonLock);
+                console.error(`[*] Removed stale SingletonLock: ${singletonLock}`);
+            } catch (e) {
+                console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
+            }
+        }
+    }
+
     // Find a free port
     const debugPort = await findFreePort();
     console.error(`[*] Using debug port: ${debugPort}`);
@@ -335,6 +380,7 @@ async function launchChromium(options = {}) {
         '--font-render-hinting=none',
         '--force-color-profile=srgb',
         `--window-size=${width},${height}`,
+        ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
         ...(headless ? ['--headless=new'] : []),
         ...(checkSsl ? [] : ['--ignore-certificate-errors']),
     ];

+ 10 - 3
archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js

@@ -115,12 +115,17 @@ async function main() {
         if (version) console.error(`[*] Version: ${version}`);
 
         // Load installed extensions
-        const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
-            path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
+        // CHROME_EXTENSIONS_DIR is derived from ACTIVE_PERSONA by get_config() in configset.py
+        const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR');
+        const userDataDir = getEnv('CHROME_USER_DATA_DIR');
+
+        if (userDataDir) {
+            console.error(`[*] Using user data dir: ${userDataDir}`);
+        }
 
         const installedExtensions = [];
         const extensionPaths = [];
-        if (fs.existsSync(extensionsDir)) {
+        if (extensionsDir && fs.existsSync(extensionsDir)) {
             const files = fs.readdirSync(extensionsDir);
             for (const file of files) {
                 if (file.endsWith('.extension.json')) {
@@ -151,9 +156,11 @@ async function main() {
         writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
 
         // Launch Chromium using consolidated function
+        // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
         const result = await launchChromium({
             binary,
             outputDir: OUTPUT_DIR,
+            userDataDir,
             extensionPaths,
         });