Browse Source

Merge branch 'dev' into plugins-browsertrix

Nick Sweeting 1 year ago
parent
commit
c22df0b63a
4 changed files with 38 additions and 5 deletions
  1. 7 1
      Dockerfile
  2. 14 3
      archivebox/config.py
  3. 2 1
      archivebox/util.py
  4. 15 0
      docker-compose.yml

+ 7 - 1
Dockerfile

@@ -267,7 +267,13 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 # Setup ArchiveBox runtime config
 WORKDIR "$DATA_DIR"
 ENV IN_DOCKER=True \
-    CUSTOM_TEMPLATES_DIR=/data/templates
+    DISPLAY=novnc:0.0 \
+    CUSTOM_TEMPLATES_DIR=/data/templates \
+    CHROME_USER_DATA_DIR=/data/personas/Default/chromium \
+    GOOGLE_API_KEY=no \
+    GOOGLE_DEFAULT_CLIENT_ID=no \
+    GOOGLE_DEFAULT_CLIENT_SECRET=no \
+    ALLOWED_HOSTS=*
     ## No need to set explicitly, these values will be autodetected by archivebox in docker:
     # CHROME_SANDBOX=False \
     # WGET_BINARY="wget" \

+ 14 - 3
archivebox/config.py

@@ -142,9 +142,10 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
         'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
-        'CURL_USER_AGENT':          {'type': str,   'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT':          {'type': str,   'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-        'CHROME_USER_AGENT':        {'type': str,   'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+        'USER_AGENT':               {'type': str,   'default': None},
+        'CURL_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
+        'WGET_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
+        'CHROME_USER_AGENT':        {'type': str,   'default': lambda c: c['USER_AGENT'] or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
 
         'COOKIES_FILE':             {'type': str,   'default': None},
         'CHROME_USER_DATA_DIR':     {'type': str,   'default': None},
@@ -280,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
 LOGS_DIR_NAME = 'logs'
+PERSONAS_DIR_NAME = 'personas'
 SQL_INDEX_FILENAME = 'index.sqlite3'
 JSON_INDEX_FILENAME = 'index.json'
 HTML_INDEX_FILENAME = 'index.html'
@@ -356,6 +358,7 @@ ALLOWED_IN_OUTPUT_DIR = {
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
+    PERSONAS_DIR_NAME,
     SQL_INDEX_FILENAME,
     f'{SQL_INDEX_FILENAME}-wal',
     f'{SQL_INDEX_FILENAME}-shm',
@@ -506,6 +509,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'ARCHIVE_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
     'SOURCES_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
     'LOGS_DIR':                 {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+    'PERSONAS_DIR':             {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'CHROME_USER_DATA_DIR':     {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
@@ -1035,6 +1039,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': config['LOGS_DIR'].exists(),
         },
+        'PERSONAS': {
+            'path': config['PERSONAS'].resolve(),
+            'enabled': True,
+            'is_valid': config['PERSONAS'].exists(),
+        },
         'ARCHIVE_DIR': {
             'path': config['ARCHIVE_DIR'].resolve(),
             'enabled': True,
@@ -1382,6 +1391,8 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO
 
     (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
+    (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
+    (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)
 
 
 

+ 2 - 1
archivebox/util.py

@@ -303,10 +303,11 @@ def chrome_args(**options) -> List[str]:
 
     if options['CHROME_USER_DATA_DIR']:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
-
+        cmd_args.append('--profile-directory=Default')
 
     return dedupe(cmd_args)
 
+
 def chrome_cleanup():
     """
     Cleans up any state or runtime files that chrome leaves behind when killed by

+ 15 - 0
docker-compose.yml

@@ -135,6 +135,21 @@ services:
     #         - ./data:/var/www
 
 
+    ### Example: Watch the ArchiveBox browser in realtime as it archives things,
+    # or remote control it to set up logins and credentials for sites you want to archive.
+    # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile
+
+    novnc:
+        image: theasp/novnc:latest
+        environment:
+            - DISPLAY_WIDTH=1920
+            - DISPLAY_HEIGHT=1080
+            - RUN_XTERM=no
+        ports:
+            # to view/control ArchiveBox's browser, visit: http://localhost:8080/vnc.html
+            - "8080:8080"
+
+
     ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
 
     # wireguard: