Browse Source

Add Persona class with cleanup_chrome() method

- Create Persona class in personas/models.py for managing browser
  profiles/identities used for archiving sessions

- Each Persona has:
  - chrome_user_data_dir: Chrome profile directory
  - chrome_extensions_dir: Installed extensions
  - cookies_file: Cookies for wget/curl
  - config_file: Persona-specific config overrides

- Add Persona methods:
  - cleanup_chrome(): Remove stale SingletonLock/SingletonSocket files
  - get_config(): Load persona config from config.json
  - save_config(): Save persona config to config.json
  - ensure_dirs(): Create persona directory structure
  - all(): Iterator over all personas
  - get_active(): Get persona based on ACTIVE_PERSONA config
  - cleanup_chrome_all(): Clean up all personas

- Update chrome_cleanup() in misc/util.py to use Persona.cleanup_chrome_all()
  instead of manual directory iteration

- Add convenience functions:
  - cleanup_chrome_for_persona(name)
  - cleanup_chrome_all_personas()
Claude 1 month ago
parent
commit
503a2f77cb
2 changed files with 259 additions and 82 deletions
  1. 12 23
      archivebox/misc/util.py
  2. 247 59
      archivebox/personas/models.py

+ 12 - 23
archivebox/misc/util.py

@@ -482,22 +482,25 @@ def chrome_cleanup():
     """
     Cleans up any state or runtime files that Chrome leaves behind when killed by
     a timeout or other error. Handles:
-    - Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
-    - Explicit CHROME_USER_DATA_DIR
+    - All persona chrome_user_data directories (via Persona.cleanup_chrome_all())
+    - Explicit CHROME_USER_DATA_DIR from config
     - Legacy Docker chromium path
     """
     import os
     from pathlib import Path
     from archivebox.config.permissions import IN_DOCKER
 
-    # Clean up persona-based user data directories
+    # Clean up all persona chrome directories using Persona class
     try:
-        from archivebox.config.configset import get_config
-        from archivebox.config.constants import CONSTANTS
+        from archivebox.personas.models import Persona
 
-        config = get_config()
+        # Clean up all personas
+        Persona.cleanup_chrome_all()
 
-        # Clean up the active persona's chrome_user_data SingletonLock
+        # Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
+        # (in case it's a custom path not under PERSONAS_DIR)
+        from archivebox.config.configset import get_config
+        config = get_config()
         chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
         if chrome_user_data_dir:
             singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
@@ -506,24 +509,10 @@ def chrome_cleanup():
                     singleton_lock.unlink()
                 except OSError:
                     pass
-
-        # Clean up all persona directories
-        personas_dir = CONSTANTS.PERSONAS_DIR
-        if personas_dir.exists():
-            for persona_dir in personas_dir.iterdir():
-                if not persona_dir.is_dir():
-                    continue
-                user_data_dir = persona_dir / 'chrome_user_data'
-                singleton_lock = user_data_dir / 'SingletonLock'
-                if singleton_lock.exists():
-                    try:
-                        singleton_lock.unlink()
-                    except OSError:
-                        pass
     except Exception:
-        pass  # Config not available during early startup
+        pass  # Persona/config not available during early startup
 
-    # Legacy Docker cleanup
+    # Legacy Docker cleanup (for backwards compatibility)
     if IN_DOCKER:
         singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
         if os.path.lexists(singleton_lock):

+ 247 - 59
archivebox/personas/models.py

@@ -1,59 +1,247 @@
-# from django.db import models
-
-# from django.conf import settings
-
-
-# class Persona(models.Model):
-#     """Aka a "SessionType", its a template for a crawler browsing session containing some config."""
-
-#     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    
-#     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-#     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-#     modified_at = models.DateTimeField(auto_now=True)
-    
-#     name = models.CharField(max_length=100, blank=False, null=False, editable=False)
-    
-#     persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False)
-#     config = models.JSONField(default=dict)
-#     # e.g. {
-#     #    USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
-#     #    COOKIES_TXT_FILE: '/path/to/cookies.txt',
-#     #    CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
-#     #    CHECK_SSL_VALIDITY: False,
-#     #    SAVE_ARCHIVEDOTORG: True,
-#     #    CHROME_BINARY: 'chromium'
-#     #    ...
-#     # }
-#     # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='')
-#     # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
-    
-#     class Meta:
-#         app_label = 'personas'
-#         verbose_name = 'Session Type'
-#         verbose_name_plural = 'Session Types'
-#         unique_together = (('created_by', 'name'),)
-    
-
-#     def clean(self):
-#         self.persona_dir = settings.PERSONAS_DIR / self.name
-#         assert self.persona_dir == settings.PERSONAS_DIR / self.name, f'Persona dir {self.persona_dir} must match settings.PERSONAS_DIR / self.name'
-        
-        
-#         # make sure config keys all exist in FLAT_CONFIG
-#         # make sure config values all match expected types
-#         pass
-        
-#     def save(self, *args, **kwargs):
-#         self.full_clean()
-        
-#         # make sure basic file structure is present in persona_dir:
-#         # - PERSONAS_DIR / self.name / 
-#         #   - chrome_profile/
-#         #   - chrome_downloads/
-#         #   - chrome_extensions/
-#         #   - cookies.txt
-#         #   - auth.json
-#         #   - config.json    # json dump of the model
-        
-#         super().save(*args, **kwargs)
+"""
+Persona management for ArchiveBox.
+
+A Persona represents a browser profile/identity used for archiving.
+Each persona has its own:
+- Chrome user data directory (for cookies, localStorage, extensions, etc.)
+- Chrome extensions directory
+- Cookies file
+- Config overrides
+
+Personas are stored as directories under PERSONAS_DIR (default: data/personas/).
+"""
+
+__package__ = 'archivebox.personas'
+
+from pathlib import Path
+from typing import Optional, Dict, Any, Iterator
+
+
+class Persona:
+    """
+    Represents a browser persona/profile for archiving sessions.
+
+    Each persona is a directory (``<personas_dir>/<name>``) containing:
+    - chrome_user_data/     Chrome profile directory
+    - chrome_extensions/    Installed extensions
+    - cookies.txt           Cookies file for wget/curl
+    - config.json           Persona-specific config overrides
+
+    Constructing a Persona only computes paths; use exists() to check whether
+    the persona directory is present and ensure_dirs() to create it.
+
+    Usage:
+        persona = Persona('Default')
+        persona.cleanup_chrome()
+
+        # Or iterate all personas:
+        for persona in Persona.all():
+            persona.cleanup_chrome()
+    """
+
+    # Names of the runtime files that cleanup_chrome() removes.
+    _CHROME_SINGLETON_FILENAMES = ('SingletonLock', 'SingletonSocket')
+
+    def __init__(self, name: str, personas_dir: Optional[Path] = None):
+        """
+        Initialize a Persona by name.
+
+        Args:
+            name: Persona name (directory name under PERSONAS_DIR)
+            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
+        """
+        self.name = name
+
+        if personas_dir is None:
+            # Imported lazily so this module can be imported before config is set up.
+            from archivebox.config.constants import CONSTANTS
+            personas_dir = CONSTANTS.PERSONAS_DIR
+
+        self.personas_dir = Path(personas_dir)
+        self.path = self.personas_dir / name
+
+    @property
+    def chrome_user_data_dir(self) -> Path:
+        """Path to Chrome user data directory for this persona."""
+        return self.path / 'chrome_user_data'
+
+    @property
+    def chrome_extensions_dir(self) -> Path:
+        """Path to Chrome extensions directory for this persona."""
+        return self.path / 'chrome_extensions'
+
+    @property
+    def cookies_file(self) -> Path:
+        """Path to cookies.txt file for this persona."""
+        return self.path / 'cookies.txt'
+
+    @property
+    def config_file(self) -> Path:
+        """Path to config.json file for this persona."""
+        return self.path / 'config.json'
+
+    @property
+    def singleton_lock(self) -> Path:
+        """Path to Chrome's SingletonLock file."""
+        return self.chrome_user_data_dir / 'SingletonLock'
+
+    def exists(self) -> bool:
+        """Check if persona directory exists."""
+        return self.path.is_dir()
+
+    def ensure_dirs(self) -> None:
+        """Create the persona directory and its chrome subdirectories if missing."""
+        self.path.mkdir(parents=True, exist_ok=True)
+        self.chrome_user_data_dir.mkdir(parents=True, exist_ok=True)
+        self.chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
+
+    def cleanup_chrome(self) -> bool:
+        """
+        Clean up Chrome state files for this persona.
+
+        Removes every SingletonLock / SingletonSocket entry found anywhere
+        under chrome_user_data_dir. These are left behind when Chrome crashes
+        or is killed, and removing them allows Chrome to start fresh.
+
+        Errors while deleting an individual entry are ignored (best effort).
+
+        Returns:
+            True if at least one entry was removed, False otherwise
+            (including when the user data directory does not exist)
+        """
+        import os
+
+        cleaned = False
+
+        # Chrome on Linux/macOS creates these entries as symlinks whose targets
+        # are normally missing once the browser is gone, so Path.exists() (which
+        # follows symlinks) reports False for exactly the stale locks we want.
+        # os.walk() lists directory entries without resolving them, and yields
+        # nothing if the root directory is missing.
+        for dirpath, dirnames, filenames in os.walk(self.chrome_user_data_dir):
+            # A symlink that points at a directory is reported in dirnames.
+            entries = set(filenames) | set(dirnames)
+            for stale_name in self._CHROME_SINGLETON_FILENAMES:
+                if stale_name not in entries:
+                    continue
+                try:
+                    os.unlink(os.path.join(dirpath, stale_name))
+                    cleaned = True
+                except OSError:
+                    pass  # Real directory with that name, already gone, or not permitted
+
+        return cleaned
+
+    def get_config(self) -> Dict[str, Any]:
+        """
+        Load persona-specific config overrides from config.json.
+
+        Returns:
+            Dict of config overrides, or an empty dict if the file is missing,
+            unreadable, not valid JSON, or does not contain a JSON object
+        """
+        import json
+
+        try:
+            loaded = json.loads(self.config_file.read_text(encoding='utf-8'))
+        except (OSError, ValueError):
+            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+            return {}
+
+        # Callers expect a mapping; ignore e.g. a top-level JSON list or string.
+        return loaded if isinstance(loaded, dict) else {}
+
+    def save_config(self, config: Dict[str, Any]) -> None:
+        """
+        Save persona-specific config overrides to config.json.
+
+        Creates the persona directories first if they do not exist.
+
+        Args:
+            config: Dict of config overrides to save (must be JSON-serializable)
+        """
+        import json
+
+        self.ensure_dirs()
+        self.config_file.write_text(json.dumps(config, indent=2), encoding='utf-8')
+
+    @classmethod
+    def all(cls, personas_dir: Optional[Path] = None) -> Iterator['Persona']:
+        """
+        Iterate over all personas in PERSONAS_DIR.
+
+        Args:
+            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
+
+        Yields:
+            A Persona for each subdirectory of personas_dir, in path order.
+            Yields nothing if personas_dir is missing or is not a directory.
+        """
+        if personas_dir is None:
+            from archivebox.config.constants import CONSTANTS
+            personas_dir = CONSTANTS.PERSONAS_DIR
+
+        personas_dir = Path(personas_dir)
+
+        # is_dir() (rather than exists()) so a stray file at this path does not
+        # make iterdir() raise NotADirectoryError.
+        if not personas_dir.is_dir():
+            return
+
+        for persona_path in sorted(personas_dir.iterdir()):
+            if persona_path.is_dir():
+                yield cls(persona_path.name, personas_dir)
+
+    @classmethod
+    def get_active(cls) -> 'Persona':
+        """
+        Get the currently active persona based on ACTIVE_PERSONA config.
+
+        Returns:
+            Persona instance for the active persona ('Default' when
+            ACTIVE_PERSONA is unset or empty)
+        """
+        from archivebox.config.configset import get_config
+
+        config = get_config()
+        # `or` also covers a key that is present but None/empty.
+        active_name = config.get('ACTIVE_PERSONA') or 'Default'
+        return cls(active_name)
+
+    @classmethod
+    def cleanup_chrome_all(cls, personas_dir: Optional[Path] = None) -> int:
+        """
+        Clean up Chrome state files for all personas.
+
+        Args:
+            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
+
+        Returns:
+            Number of personas that had cleanup performed
+        """
+        # cleanup_chrome() returns a bool, so summing counts the True results.
+        return sum(1 for persona in cls.all(personas_dir) if persona.cleanup_chrome())
+
+    def __str__(self) -> str:
+        return f"Persona({self.name})"
+
+    def __repr__(self) -> str:
+        return f"Persona(name={self.name!r}, path={self.path!r})"
+
+
+# Convenience functions for use without instantiating Persona class
+
+def cleanup_chrome_for_persona(name: str, personas_dir: Optional[Path] = None) -> bool:
+    """
+    Run Chrome state-file cleanup for the single persona called `name`.
+
+    Shortcut for building a Persona and calling its cleanup_chrome() method.
+
+    Args:
+        name: Persona name
+        personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
+
+    Returns:
+        Whatever Persona.cleanup_chrome() reports: True if cleanup was
+        performed, False if no cleanup was needed
+    """
+    persona = Persona(name, personas_dir)
+    was_cleaned = persona.cleanup_chrome()
+    return was_cleaned
+
+
+def cleanup_chrome_all_personas(personas_dir: Optional[Path] = None) -> int:
+    """
+    Run Chrome state-file cleanup for every persona.
+
+    Shortcut for the Persona.cleanup_chrome_all() classmethod.
+
+    Args:
+        personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
+
+    Returns:
+        Number of personas that had cleanup performed
+    """
+    cleaned_total = Persona.cleanup_chrome_all(personas_dir)
+    return cleaned_total