
make chrome binary and configs directly runnable and make extractor use external bin

Nick Sweeting, 1 year ago
commit ac53fdf677

+ 13 - 9
archivebox/base_models/models.py

@@ -3,11 +3,12 @@ This file provides the Django ABIDField and ABIDModel base model to inherit from
 """
 """
 
 
 
 
-from typing import Any, Dict, Union, List, Set, cast
 
 
+import json
 from uuid import uuid4
 from uuid import uuid4
 from functools import partial
 from functools import partial
 from pathlib import Path
 from pathlib import Path
+from typing import Any, Dict, Union, List, Set, cast
 from charidfield import CharIDField  # type: ignore[import-untyped]
 from charidfield import CharIDField  # type: ignore[import-untyped]
 
 
 from django.contrib import admin
 from django.contrib import admin
@@ -27,6 +28,7 @@ from django_stubs_ext.db.models import TypedModelMeta
 
 
 from archivebox.index.json import to_json
+from archivebox.misc.hashing import get_dir_info
 
 from .abid import (
     ABID,
@@ -590,18 +592,20 @@ class ModelWithOutputDir(ABIDModel):
         """Write the ./.index.merkle file to the output dir"""
         """Write the ./.index.merkle file to the output dir"""
         # write self.generate_merkle_tree() to self.output_dir / '.index.merkle'
         # write self.generate_merkle_tree() to self.output_dir / '.index.merkle'
         print(f'{type(self).__name__}[{self.ABID}].save_merkle_index()')
         print(f'{type(self).__name__}[{self.ABID}].save_merkle_index()')
+        dir_info = get_dir_info(self.OUTPUT_DIR, max_depth=6)
+        with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
+            json.dump(dir_info, f)
         pass
         pass
     
     
     def save_html_index(self, **kwargs) -> None:
     def save_html_index(self, **kwargs) -> None:
         # write self.as_html() to self.output_dir / 'index.html'
         # write self.as_html() to self.output_dir / 'index.html'
         print(f'{type(self).__name__}[{self.ABID}].save_html_index()')
         print(f'{type(self).__name__}[{self.ABID}].save_html_index()')
-        pass
+        (self.OUTPUT_DIR / 'index.html').write_text(self.as_html())
     
     
     def save_json_index(self, **kwargs) -> None:
     def save_json_index(self, **kwargs) -> None:
         print(f'{type(self).__name__}[{self.ABID}].save_json_index()')
         print(f'{type(self).__name__}[{self.ABID}].save_json_index()')
         # write self.as_json() to self.output_dir / 'index.json'
         # write self.as_json() to self.output_dir / 'index.json'
         (self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))
         (self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))
-        pass
     
     
     def save_symlinks_index(self) -> None:
     def save_symlinks_index(self) -> None:
         print(f'{type(self).__name__}[{self.ABID}].save_symlinks_index()')
         print(f'{type(self).__name__}[{self.ABID}].save_symlinks_index()')
@@ -610,26 +614,26 @@ class ModelWithOutputDir(ABIDModel):
         # ln -s self.output_dir data/archive/1453452234234.21445
         pass
 
-    def as_json(self) -> dict:
+    def as_json(self, *keys) -> dict:
         """Get the object's properties as a dict"""
-        # dump the object's properties to a json-ready dict
         return {
             'TYPE': self.TYPE,
-            'id': self.id,
+            'id': str(self.id),
             'abid': str(self.ABID),
             'str': str(self),
-            'modified_at': self.modified_at,
-            'created_at': self.created_at,
             'created_by_id': self.created_by_id,
+            'created_at': self.created_at,
+            'modified_at': self.modified_at,
             'status': getattr(self, 'status', None),
             'retry_at': getattr(self, 'retry_at', None),
             'notes': getattr(self, 'notes', None),
+            **{key: getattr(self, key) for key in keys},
         }
     
     def as_html(self) -> str:
         """Get the object's properties as a html string"""
         # render snapshot_detail.html template with self as context and return html string
-        return ''
+        return str(self)
 
 
 ####################################################

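The index-writing methods on ModelWithOutputDir now produce real files, and as_json() stringifies ids and accepts extra attribute names to include. A minimal usage sketch, illustrative only (assumes Django settings are already loaded so the core app is importable, as in extractor.py below; write_all_indexes is a hypothetical helper):

    # Illustrative sketch, not part of this commit.
    from core.models import Snapshot  # assumes Django settings are already loaded

    def write_all_indexes(snapshot: Snapshot) -> dict:
        """Write the json/html/merkle indexes for one snapshot and return its json record."""
        snapshot.save_json_index()    # -> <OUTPUT_DIR>/index.json   via to_json(self.as_json())
        snapshot.save_html_index()    # -> <OUTPUT_DIR>/index.html   (currently just str(self))
        snapshot.save_merkle_index()  # -> <OUTPUT_DIR>/.hashes.json via get_dir_info(..., max_depth=6)
        # extra positional keys are pulled off the instance with getattr():
        return snapshot.as_json('url', 'timestamp')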
+ 1 - 0
archivebox/config/common.py

@@ -104,6 +104,7 @@ SERVER_CONFIG = ServerConfig()
 
 class ArchivingConfig(BaseConfigSet):
     ONLY_NEW: bool                        = Field(default=True)
+    OVERWRITE: bool                       = Field(default=False)
     
     TIMEOUT: int                          = Field(default=60)
     MEDIA_TIMEOUT: int                    = Field(default=3600)

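The new OVERWRITE flag defaults to False and is picked up by plugin configs further down (see ChromeConfig.OVERWRITE below). A minimal sketch of how an extractor might consult it, assuming ARCHIVING_CONFIG is the module-level ArchivingConfig() instance in this file (mirroring the SERVER_CONFIG pattern above); should_rearchive is a hypothetical helper:

    # Hypothetical helper, for illustration only.
    from archivebox.config.common import ARCHIVING_CONFIG

    def should_rearchive(output_already_exists: bool) -> bool:
        # re-run an extractor if nothing exists yet, or if the user opted into overwriting
        return (not output_already_exists) or ARCHIVING_CONFIG.OVERWRITE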
+ 113 - 6
archivebox/core/models.py

@@ -16,7 +16,7 @@ from django.utils.text import slugify
 from django.utils import timezone
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
-from django.db.models import Case, When, Value, IntegerField
+from django.db.models import Case, When, IntegerField
 from django.contrib import admin
 from django.conf import settings
 
@@ -25,7 +25,8 @@ import abx
 
 from archivebox.config import CONSTANTS
 from archivebox.misc.system import get_dir_size
-from archivebox.misc.util import parse_date, base_url
+from archivebox.misc.util import parse_date, base_url, domain as url_domain
+from archivebox.misc.hashing import get_dir_info
 from archivebox.index.schema import Link
 from archivebox.index.html import snapshot_icons
 from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
@@ -142,8 +143,20 @@ def validate_timestamp(value):
     assert value.replace('.', '').isdigit(), f'timestamp must be a float str, got: "{value}"'
 
 class SnapshotManager(models.Manager):
+    def filter(self, *args, **kwargs):
+        """add support for .filter(domain='example.com') to Snapshot queryset"""
+        domain = kwargs.pop('domain', None)
+        qs = super().filter(*args, **kwargs)
+        if domain:
+            qs = qs.filter(url__icontains=f'://{domain}')
+        return qs
+    
     def get_queryset(self):
-        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+        return (
+            super().get_queryset()
+                .prefetch_related('tags', 'archiveresult_set') 
+                # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+        )
 
 class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
     abid_prefix = 'snp_'
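With the overridden manager method, callers can filter snapshots by domain without spelling out the url lookup themselves, roughly like this (illustrative sketch, not code from this commit):

    # Illustrative usage sketch.
    from core.models import Snapshot  # assumes Django settings are already loaded

    # the domain kwarg is popped off and rewritten to url__icontains='://example.com'
    recent = Snapshot.objects.filter(domain='example.com').order_by('-created_at')[:10]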
@@ -256,6 +269,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDM
         if self.crawl and self.url not in self.crawl.urls:
             self.crawl.urls += f'\n{self.url}'
             self.crawl.save()
+            
+            
+    def output_dir_parent(self) -> str:
+        return 'archive'
+    
+    def output_dir_name(self) -> str:
+        return str(self.timestamp)
 
     def archive(self, overwrite=False, methods=None):
         result = bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
@@ -338,6 +358,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDM
     def bookmarked_date(self):
         # TODO: remove this
         return self.bookmarked
+    
+    @cached_property
+    def domain(self) -> str:
+        return url_domain(self.url)
 
     @cached_property
     def is_archived(self):
@@ -659,7 +683,8 @@ class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine,
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this ArchiveResult should have')
 
     # the network interface that was used to download this result
-    # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
+    # machine = models.ForeignKey(Machine, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Machine Used')
+    # network = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
 
     objects = ArchiveResultManager()
     
@@ -742,8 +767,7 @@ class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine,
             return None
 
     def legacy_output_path(self):
-        link = self.snapshot.as_link()
-        return link.canonical_outputs().get(f'{self.extractor}_path')
+        return self.canonical_outputs().get(f'{self.extractor}_path')
 
     def output_exists(self) -> bool:
         output_path = Path(self.snapshot_dir) / self.extractor
@@ -761,6 +785,89 @@ class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine,
             for key in args
         }
         
+    def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        """Predict the expected output paths that should be present after archiving"""
+        # You'll need to implement the actual logic based on your requirements
+        # TODO: banish this awful duplication from the codebase and import these
+        # from their respective extractor files
+
+
+        from abx_plugin_favicon.config import FAVICON_CONFIG
+        canonical = {
+            'index_path': 'index.html',
+            'favicon_path': 'favicon.ico',
+            'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.domain),
+            'wget_path': f'warc/{self.timestamp}',
+            'warc_path': 'warc/',
+            'singlefile_path': 'singlefile.html',
+            'readability_path': 'readability/content.html',
+            'mercury_path': 'mercury/content.html',
+            'htmltotext_path': 'htmltotext.txt',
+            'pdf_path': 'output.pdf',
+            'screenshot_path': 'screenshot.png',
+            'dom_path': 'output.html',
+            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
+            'git_path': 'git/',
+            'media_path': 'media/',
+            'headers_path': 'headers.json',
+        }
+        
+        if self.is_static:
+            static_path = f'warc/{self.timestamp}'
+            canonical.update({
+                'title': self.basename,
+                'wget_path': static_path,
+                'pdf_path': static_path,
+                'screenshot_path': static_path,
+                'dom_path': static_path,
+                'singlefile_path': static_path,
+                'readability_path': static_path,
+                'mercury_path': static_path,
+                'htmltotext_path': static_path,
+            })
+        return canonical
+        
+    @property
+    def output_dir_name(self) -> str:
+        return self.extractor
+        
+    @property
+    def output_dir_parent(self) -> str:
+        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+        
+    @cached_property
+    def output_files(self) -> dict[str, dict]:
+        dir_info = get_dir_info(self.OUTPUT_DIR, max_depth=6)
+        with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
+            json.dump(dir_info, f)
+        return dir_info
+    
+    def announce_event(self, output_type: str, event: dict):
+        event = {
+            **event,
+            'type': output_type,
+        }
+        
+        # if event references a file, make sure it exists on disk
+        if 'path' in event:
+            file_path = Path(self.OUTPUT_DIR) / event['path']
+            assert file_path.exists(), f'ArchiveResult[{self.ABID}].announce_event(): File does not exist: {file_path} ({event})'
+            
+        with open(self.OUTPUT_DIR / '.events.jsonl', 'a') as f:
+            f.write(json.dumps(event, sort_keys=True, default=str) + '\n')
+            
+    def events(self, filter_type: str | None=None) -> list[dict]:
+        events = []
+        try:
+            with open(self.OUTPUT_DIR / '.events.jsonl', 'r') as f:
+                for line in f:
+                    event = json.loads(line)
+                    if filter_type is None or event['type'] == filter_type:
+                        events.append(event)
+        except FileNotFoundError:
+            pass
+        return events
+        
     def write_indexes(self):
         """Write the ArchiveResult json, html, and merkle indexes to output dir, and pass searchable text to the search backend"""
         super().write_indexes()

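The new announce_event()/events() pair gives each ArchiveResult an append-only .events.jsonl log alongside its outputs. A rough sketch of the write/read cycle (illustrative only; assumes the result's OUTPUT_DIR already contains the referenced file, since announce_event() asserts it exists; log_screenshot_event is a hypothetical helper):

    # Illustrative sketch, not part of this commit.
    from core.models import ArchiveResult  # assumes Django settings are already loaded

    def log_screenshot_event(result: ArchiveResult) -> list[dict]:
        # appends {'type': 'screenshot', ...} as one JSON line to <OUTPUT_DIR>/.events.jsonl
        result.announce_event('screenshot', {'path': 'screenshot.png', 'status': 'succeeded'})
        # reads the log back, keeping only events of the requested type
        return result.events(filter_type='screenshot')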
+ 5 - 4
archivebox/extractors/extractor.py

@@ -2,12 +2,15 @@ import hashlib
 import mimetypes
 import os
 
+import subprocess
 from typing import ClassVar
 from datetime import timedelta
 from zipfile import Path
 
 from django.utils import timezone
 
+from archivebox.misc.hashing import get_dir_info
+
 from core.models import ArchiveResult
 
 import abx
@@ -205,9 +208,7 @@
     
     def after_extract(self, error: Exception | None=None):
         status, retry_at = self.determine_status()
-        
-        self.archiveresult.outputs = []
-        
+
         
         self.archiveresult.error = f'{type(error).__name__}: {error}' if error else None
         self.archiveresult.status = self.archiveresult.StatusChoices.FAILED if error else self.archiveresult.StatusChoices.SUCCEEDED
@@ -216,4 +217,4 @@ class Extractor:
         self.archiveresult.output = self.archiveresult.outputs[0].path
         self.archiveresult.save()
         self.archiveresult.write_indexes()
-    
+    

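The added `import subprocess` points at this commit's goal of having extractors call external binaries (like the directly runnable chrome scripts below) instead of driving everything in-process. A rough sketch of that pattern; the helper name and arguments are hypothetical, only subprocess.run is real API:

    # Hypothetical helper illustrating the external-binary pattern; not code from this diff.
    import subprocess

    def run_external_bin(cmd: list[str], cwd: str, timeout: int = 60) -> tuple[int, str, str]:
        proc = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, timeout=timeout)
        return proc.returncode, proc.stdout, proc.stderr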
+ 96 - 53
archivebox/filestore/models.py

@@ -1,67 +1,110 @@
-# import mimetypes
-# import uuid
+import mimetypes
+import uuid
+from datetime import timedelta
+from pathlib import Path
+from django.db import models
+from django.conf import settings
+from django.utils import timezone
 
-# from django.db import models
-# from django.conf import settings
-# from django.utils import timezone
+from archivebox import DATA_DIR
+from archivebox.misc.hashing import get_dir_info, hash_file
+from base_models.abid import DEFAULT_ABID_URI_SALT
+from base_models.models import ABIDModel, ABIDField, get_or_create_system_user_pk
 
-# from archivebox import DATA_DIR
-# from archivebox.misc.hashing import get_dir_info, hash_file
-# from base_models.abid import DEFAULT_ABID_URI_SALT
-# from base_models.models import ABIDModel, ABIDField, get_or_create_system_user_pk
 
-
-# class File(ABIDModel):
-#     abid_prefix = 'fil_'
-#     abid_ts_src = 'self.created_at'
-#     abid_uri_src = 'self.path'
-#     abid_subtype_src = 'self.mime_type'
-#     abid_rand_src = 'self.id'
-#     abid_salt: str = DEFAULT_ABID_URI_SALT           # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
-#     abid_drift_allowed: bool = False        
+class File(ABIDModel):
+    abid_prefix = 'fil_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.path'
+    abid_subtype_src = 'self.mime_type'
+    abid_rand_src = 'self.id'
+    abid_salt: str = DEFAULT_ABID_URI_SALT           # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
+    abid_drift_allowed: bool = False        
     
-#     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, null=False)
-#     abid = ABIDField(prefix=abid_prefix)
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, null=False)
+    abid = ABIDField(prefix=abid_prefix)
 
-#     created_at = models.DateTimeField(default=timezone.now, null=False)
-#     modified_at = models.DateTimeField(default=timezone.now, null=False)
-#     created_by = models.ForeignKey(settings.USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
+    created_at = models.DateTimeField(default=timezone.now, null=False)
+    modified_at = models.DateTimeField(default=timezone.now, null=False)
+    created_by = models.ForeignKey(settings.USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
+    
+    class StatusChoices(models.TextChoices):
+        UNLOCKED = 'unlocked'
+        LOCKED = 'locked'
+    
+    status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.UNLOCKED, null=False)
+    retry_at = models.DateTimeField(default=None, null=True)
+    version = models.CharField(max_length=16, default='unknown', null=False)
     
-#     path = models.FilePathField(path=str(DATA_DIR), recursive=True, allow_files=True, allow_folders=True, db_index=True, unique=True)
+    file = models.FileField(null=False)
     
-#     basename = models.CharField(max_length=255, default=None, null=False)                     # e.g. 'index'
-#     extension = models.CharField(max_length=63, default='', null=False)                       # e.g. 'html'
-#     mime_type = models.CharField(max_length=63, default=None, null=False, db_index=True)      # e.g. 'inode/directory' or 'text/html'
-#     num_subpaths = models.IntegerField(default=None, null=False)                              # e.g. 3
-#     num_bytes = models.IntegerField(default=None, null=False)                                 # e.g. 123456
+    basename = models.CharField(max_length=255, default=None, null=False)                     # e.g. 'index'
+    extension = models.CharField(max_length=63, default='', null=False)                       # e.g. 'html'
+    mime_type = models.CharField(max_length=63, default=None, null=False, db_index=True)      # e.g. 'inode/directory' or 'text/html'
+    num_subpaths = models.IntegerField(default=None, null=False)                              # e.g. 3
+    num_bytes = models.IntegerField(default=None, null=False)                                 # e.g. 123456
     
-#     hash_sha256 = models.CharField(max_length=64, default=None, null=False, db_index=True)    # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
-#     # hash_blake3 = models.CharField(max_length=64, default=None, null=False, db_index=True)  # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
+    sha256 = models.CharField(max_length=64, default=None, null=False, db_index=True)    # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
+    # blake3 = models.CharField(max_length=64, default=None, null=False, db_index=True)  # e.g. '5994471abb01112afcc1815994471abb01112afcc1815994471abb01112afcc181'
     
-#     DIR = 'inode/directory'
+    DIR = 'inode/directory'
 
+    @classmethod
+    def release_expired_locks(cls):
+        cls.objects.filter(status='locked', retry_at__lt=timezone.now()).update(status='unlocked', retry_at=None)
 
-#     @property
-#     def parent(self) -> 'File':
-#         return File.objects.get(path=self.path.parent) or File(path=self.path.parent)
+    @property
+    def parent(self) -> 'File':
+        return File.objects.get(path=str(self.PATH.parent)) or File(path=str(self.PATH.parent))
+    
+    @property
+    def relpath(self) -> Path:
+        return Path(self.file.name)
+    
+    @property
+    def abspath(self) -> Path:
+        return DATA_DIR / self.file.name
 
-#     def save(self, *args, **kwargs):
-#         assert self.path.exists()
+    def save(self, *args, **kwargs):
+        assert self.abspath.exists()
         
-#         if self.path.is_dir():
-#             self.basename = self.path.name
-#             self.extension = ''
-#             self.mime_type = self.DIR
-#             dir_info = get_dir_info(self.path)
-#             self.num_subpaths = dir_info['.']['num_subpaths']
-#             self.num_bytes = dir_info['.']['num_bytes']
-#             self.hash_sha256 = dir_info['.']['hash_sha256']
-#             # TODO: hash_blake3 = dir_info['.']['hash_blake3']
-#         else:
-#             self.basename = self.path.name
-#             self.extension = self.path.suffix
-#             self.mime_type = mimetypes.guess_type(self.path)[0]
-#             self.num_bytes = self.path.stat().st_size
-#             self.hash_sha256, self.hash_blake3 = hash_file(self.path)
-#         super().save(*args, **kwargs)
+        if self.abspath.is_dir():
+            self.basename = self.relpath.name
+            self.extension = ''
+            self.mime_type = self.DIR
+            dir_info = get_dir_info(self.abspath)
+            self.num_subpaths = dir_info['.']['num_subpaths']
+            self.num_bytes = dir_info['.']['num_bytes']
+            self.hash_sha256 = dir_info['.']['hash_sha256']
+            # TODO: hash_blake3 = dir_info['.']['hash_blake3']
+        else:
+            self.basename = self.relpath.name
+            self.extension = self.relpath.suffix
+            self.mime_type = mimetypes.guess_type(self.abspath)[0]
+            self.num_bytes = self.abspath.stat().st_size
+            self.hash_sha256, self.hash_blake3 = hash_file(self.abspath)
+        super().save(*args, **kwargs)
+            
+
+    def acquire_lock(self, timeout_seconds: int = 60):
+        self.status = 'locked'
+        self.retry_at = timezone.now() + timedelta(seconds=timeout_seconds)
+        self.save()
+
+    def release_lock(self):
+        self.status = 'unlocked'
+        self.retry_at = None
+        self.save()
+
+    def move_to(self, new_path: Path):
+        if str(new_path).startswith(str(DATA_DIR)):
+            new_relpath = new_path.relative_to(DATA_DIR)
+            new_abspath = new_path
+        else:
+            new_relpath = new_path
+            new_abspath = DATA_DIR / new_path
             
+        new_abspath.parent.mkdir(parents=True, exist_ok=True)
+        self.abspath.rename(new_abspath)
+        self.file.name = new_relpath
+        self.save()

+ 7 - 0
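The un-commented File model adds a simple status/retry_at lock plus move_to() for relocating files under DATA_DIR. A sketch of the intended lifecycle (illustrative only; relocate is a hypothetical helper, not code from this diff):

    # Hypothetical helper showing the lock/move lifecycle; not code from this diff.
    from pathlib import Path
    from filestore.models import File  # assumes Django settings are already loaded

    def relocate(f: File, dest: Path) -> None:
        File.release_expired_locks()         # unlock any rows whose retry_at has passed
        f.acquire_lock(timeout_seconds=60)   # status='locked', retry_at=now+60s
        try:
            f.move_to(dest)                  # renames on disk and re-points f.file relative to DATA_DIR
        finally:
            f.release_lock()                 # status='unlocked', retry_at=None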
archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/binaries.py

@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+
+__package__ = 'abx_plugin_chrome'
 import os
 import platform
 from pathlib import Path
@@ -147,3 +150,7 @@ class ChromeBinary(Binary):
 
 CHROME_BINARY = ChromeBinary()
 
+
+if __name__ == '__main__':
+    binary = CHROME_BINARY.load()
+    print(binary.version, '  ', binary.abspath)

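With the shebang and __main__ block, the binaries module can be executed on its own to resolve Chrome and print its version and abspath. One way to call it from another process (the file path comes from this diff; the invocation itself is illustrative and assumes it is run from the repo root with dependencies installed):

    # Illustrative invocation of the now directly-runnable module.
    import subprocess, sys

    proc = subprocess.run(
        [sys.executable, 'archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/binaries.py'],
        capture_output=True, text=True,
    )
    print(proc.stdout)  # e.g. "<version>    <abspath>" printed by the __main__ block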
+ 81 - 11
archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/config.py

@@ -1,5 +1,8 @@
+#!/usr/bin/env python3
+
 import os
 from pathlib import Path
+import sys
 from typing import List, Optional
 
 from pydantic import Field
@@ -79,16 +82,67 @@ class ChromeConfig(BaseConfigSet):
     # Chrome Binary
     CHROME_BINARY: str                      = Field(default='chrome')
     CHROME_DEFAULT_ARGS: List[str]          = Field(default=[
+        "--disable-sync",
+        "--no-pings",
         "--no-first-run",                                              # dont show any first run ui / setup prompts
         "--no-first-run",                                              # dont show any first run ui / setup prompts
-        '--virtual-time-budget=15000',                                 # accellerate any animations on the page by 15s into the future
-        '--disable-features=DarkMode',                                 # disable dark mode for archiving
-        "--run-all-compositor-stages-before-draw",                     # dont draw partially rendered content, wait until everything is ready
-        "--hide-scrollbars",                                           # hide scrollbars to prevent layout shift / scrollbar visible in screenshots
-        "--autoplay-policy=no-user-gesture-required",                  # allow media autoplay without user gesture (e.g. on mobile)
-        "--use-fake-ui-for-media-stream",                              # provide fake camera if site tries to request camera access
+        "--no-default-browser-check",
+        "--disable-default-apps",
+        "--ash-no-nudges",
+        "--disable-infobars",
+        "--disable-blink-features=AutomationControlled",
+        "--js-flags=--random-seed=1157259159",
+        "--deterministic-mode",
+        "--deterministic-fetch",
+        "--start-maximized",
+        "--test-type=gpu",
+        "--disable-search-engine-choice-screen",
+        "--disable-session-crashed-bubble", 
+        "--hide-crash-restore-bubble",
+        "--suppress-message-center-popups",
+        "--disable-client-side-phishing-detection",
+        "--disable-domain-reliability",
+        "--disable-component-update",
+        "--disable-datasaver-prompt",
+        "--disable-hang-monitor",
+        "--disable-session-crashed-bubble",
+        "--disable-speech-synthesis-api",
+        "--disable-speech-api",
+        "--disable-print-preview",
+        "--safebrowsing-disable-auto-update",
+        "--deny-permission-prompts",
+        "--disable-external-intent-requests",
+        "--disable-notifications",
+        "--disable-desktop-notifications",
+        "--noerrdialogs",
+        "--disable-popup-blocking",
+        "--disable-prompt-on-repost",
+        "--silent-debugger-extension-api",
+        "--block-new-web-contents",
+        "--metrics-recording-only",
+        "--disable-breakpad",
+        "--run-all-compositor-stages-before-draw",
         "--use-fake-device-for-media-stream",                          # provide fake camera if site tries to request camera access
         "--use-fake-device-for-media-stream",                          # provide fake camera if site tries to request camera access
-        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",   # ignore chrome updates
-        "--force-gpu-mem-available-mb=4096",                           # allows for longer full page screenshots https://github.com/puppeteer/puppeteer/issues/5530
+        "--simulate-outdated-no-au=Tue, 31 Dec 2099 23:59:59 GMT",   # ignore chrome updates
+        "--force-gpu-mem-available-mb=4096",     # allows for longer full page screenshots https://github.com/puppeteer/puppeteer/issues/5530
+        "--password-store=basic",
+        "--use-mock-keychain",
+        "--disable-cookie-encryption",
+        "--allow-legacy-extension-manifests",
+        "--disable-gesture-requirement-for-media-playback",
+        "--font-render-hinting=none",
+        "--force-color-profile=srgb",
+        "--disable-partial-raster",
+        "--disable-skia-runtime-opts",
+        "--disable-2d-canvas-clip-aa",
+        "--disable-lazy-loading",
+        "--disable-renderer-backgrounding",
+        "--disable-background-networking",
+        "--disable-background-timer-throttling",
+        "--disable-backgrounding-occluded-windows",
+        "--disable-ipc-flooding-protection",
+        "--disable-extensions-http-throttling",
+        "--disable-field-trial-config",
+        "--disable-back-forward-cache",
     ])
     CHROME_EXTRA_ARGS: List[str]           = Field(default=[])
     
@@ -99,6 +153,7 @@ class ChromeConfig(BaseConfigSet):
     CHROME_RESOLUTION: str                  = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
     CHROME_CHECK_SSL_VALIDITY: bool         = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     
+    
     # Cookies & Auth
     CHROME_USER_AGENT: str                  = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
     CHROME_USER_DATA_DIR: Path | None       = Field(default=CONSTANTS.PERSONAS_DIR / 'Default' / 'chrome_profile')
@@ -108,6 +163,8 @@ class ChromeConfig(BaseConfigSet):
     SAVE_SCREENSHOT: bool                   = Field(default=True, alias='FETCH_SCREENSHOT')
     SAVE_DOM: bool                          = Field(default=True, alias='FETCH_DOM')
     SAVE_PDF: bool                          = Field(default=True, alias='FETCH_PDF')
+    
+    OVERWRITE: bool                         = Field(default=lambda: ARCHIVING_CONFIG.OVERWRITE)
 
     def validate(self):
         from archivebox.config.paths import create_and_chown_dir
@@ -147,7 +204,11 @@
                 
                 self.update_in_place(CHROME_USER_DATA_DIR=None)
             
-
+    @property
+    def CHROME_ARGS(self) -> str:
+        # import shlex
+        # return '\n'.join(shlex.quote(arg) for arg in self.chrome_args())
+        return '\n'.join(self.chrome_args())
     def chrome_args(self, **options) -> List[str]:
         """helper to build up a chrome shell command with arguments"""
     
@@ -157,8 +218,8 @@
     
         cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS]
     
-        if options.CHROME_HEADLESS:
-            cmd_args += ["--headless=new"]   # expects chrome version >= 111
+        # if options.CHROME_HEADLESS:
+        #     cmd_args += ["--headless"]   # expects chrome version >= 111
     
         if not options.CHROME_SANDBOX:
             # assume this means we are running inside a docker container
@@ -205,3 +266,12 @@
 
 CHROME_CONFIG = ChromeConfig()
 
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        result = getattr(CHROME_CONFIG, sys.argv[1], '')
+        if callable(result):
+            result = result()
+        print(result)
+    else:
+        print(CHROME_CONFIG.model_dump_json(indent=4))
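The __main__ block makes the config module queryable from the shell: given an argument it prints that single attribute (calling it if it is callable), otherwise it dumps the whole config as JSON. That lets an external extractor script read computed values such as the newline-joined CHROME_ARGS without importing the rest of the app. An illustrative out-of-process call (the file path comes from this diff; the invocation assumes it is run from the repo root with dependencies installed):

    # Illustrative out-of-process read of a computed config value.
    import subprocess, sys

    output = subprocess.run(
        [sys.executable, 'archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/config.py', 'CHROME_ARGS'],
        capture_output=True, text=True,
    ).stdout
    chrome_args = output.splitlines()  # the CHROME_ARGS property joins chrome_args() with newlines
    print(len(chrome_args), 'chrome args resolved')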