models.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. """
  2. Persona management for ArchiveBox.
  3. A Persona represents a browser profile/identity used for archiving.
  4. Each persona has its own:
  5. - Chrome user data directory (for cookies, localStorage, extensions, etc.)
  6. - Chrome extensions directory
  7. - Cookies file
  8. - Config overrides
  9. """
  10. __package__ = 'archivebox.personas'
  11. from pathlib import Path
  12. from typing import TYPE_CHECKING, Iterator
  13. from django.db import models
  14. from django.conf import settings
  15. from django.utils import timezone
  16. from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
  17. from archivebox.uuid_compat import uuid7
  18. if TYPE_CHECKING:
  19. from django.db.models import QuerySet
  20. class Persona(ModelWithConfig):
  21. """
  22. Browser persona/profile for archiving sessions.
  23. Each persona provides:
  24. - CHROME_USER_DATA_DIR: Chrome profile directory
  25. - CHROME_EXTENSIONS_DIR: Installed extensions directory
  26. - CHROME_DOWNLOADS_DIR: Chrome downloads directory
  27. - COOKIES_FILE: Cookies file for wget/curl
  28. - config: JSON field with persona-specific config overrides
  29. Usage:
  30. # Get persona and its derived config
  31. config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
  32. chrome_dir = config['CHROME_USER_DATA_DIR']
  33. # Or access directly from persona
  34. persona = Persona.objects.get(name='Default')
  35. persona.CHROME_USER_DATA_DIR # -> Path to chrome_user_data
  36. """
  37. id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
  38. name = models.CharField(max_length=64, unique=True)
  39. created_at = models.DateTimeField(default=timezone.now, db_index=True)
  40. created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
  41. class Meta:
  42. app_label = 'personas'
  43. def __str__(self) -> str:
  44. return self.name
  45. @property
  46. def path(self) -> Path:
  47. """Path to persona directory under PERSONAS_DIR."""
  48. from archivebox.config.constants import CONSTANTS
  49. return CONSTANTS.PERSONAS_DIR / self.name
  50. @property
  51. def CHROME_USER_DATA_DIR(self) -> str:
  52. """Derived path to Chrome user data directory for this persona."""
  53. return str(self.path / 'chrome_user_data')
  54. @property
  55. def CHROME_EXTENSIONS_DIR(self) -> str:
  56. """Derived path to Chrome extensions directory for this persona."""
  57. return str(self.path / 'chrome_extensions')
  58. @property
  59. def CHROME_DOWNLOADS_DIR(self) -> str:
  60. """Derived path to Chrome downloads directory for this persona."""
  61. return str(self.path / 'chrome_downloads')
  62. @property
  63. def COOKIES_FILE(self) -> str:
  64. """Derived path to cookies.txt file for this persona (if exists)."""
  65. cookies_path = self.path / 'cookies.txt'
  66. return str(cookies_path) if cookies_path.exists() else ''
  67. def get_derived_config(self) -> dict:
  68. """
  69. Get config dict with derived paths filled in.
  70. Returns dict with:
  71. - All values from self.config JSONField
  72. - CHROME_USER_DATA_DIR (derived from persona path)
  73. - CHROME_EXTENSIONS_DIR (derived from persona path)
  74. - CHROME_DOWNLOADS_DIR (derived from persona path)
  75. - COOKIES_FILE (derived from persona path, if file exists)
  76. - ACTIVE_PERSONA (set to this persona's name)
  77. """
  78. derived = dict(self.config or {})
  79. # Add derived paths (don't override if explicitly set in config)
  80. if 'CHROME_USER_DATA_DIR' not in derived:
  81. derived['CHROME_USER_DATA_DIR'] = self.CHROME_USER_DATA_DIR
  82. if 'CHROME_EXTENSIONS_DIR' not in derived:
  83. derived['CHROME_EXTENSIONS_DIR'] = self.CHROME_EXTENSIONS_DIR
  84. if 'CHROME_DOWNLOADS_DIR' not in derived:
  85. derived['CHROME_DOWNLOADS_DIR'] = self.CHROME_DOWNLOADS_DIR
  86. if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
  87. derived['COOKIES_FILE'] = self.COOKIES_FILE
  88. # Always set ACTIVE_PERSONA to this persona's name
  89. derived['ACTIVE_PERSONA'] = self.name
  90. return derived
  91. def ensure_dirs(self) -> None:
  92. """Create persona directories if they don't exist."""
  93. self.path.mkdir(parents=True, exist_ok=True)
  94. (self.path / 'chrome_user_data').mkdir(parents=True, exist_ok=True)
  95. (self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
  96. (self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True)
  97. def cleanup_chrome(self) -> bool:
  98. """
  99. Clean up Chrome state files (SingletonLock, etc.) for this persona.
  100. Returns:
  101. True if cleanup was performed, False if no cleanup needed
  102. """
  103. cleaned = False
  104. chrome_dir = self.path / 'chrome_user_data'
  105. if not chrome_dir.exists():
  106. return False
  107. # Clean up SingletonLock files
  108. for lock_file in chrome_dir.glob('**/SingletonLock'):
  109. try:
  110. lock_file.unlink()
  111. cleaned = True
  112. except OSError:
  113. pass
  114. # Clean up SingletonSocket files
  115. for socket_file in chrome_dir.glob('**/SingletonSocket'):
  116. try:
  117. socket_file.unlink()
  118. cleaned = True
  119. except OSError:
  120. pass
  121. return cleaned
  122. @classmethod
  123. def get_or_create_default(cls) -> 'Persona':
  124. """Get or create the Default persona."""
  125. persona, _ = cls.objects.get_or_create(name='Default')
  126. return persona
  127. @classmethod
  128. def cleanup_chrome_all(cls) -> int:
  129. """Clean up Chrome state files for all personas."""
  130. cleaned = 0
  131. for persona in cls.objects.all():
  132. if persona.cleanup_chrome():
  133. cleaned += 1
  134. return cleaned