
Merge branch 'dev' into version-banner

Nick Sweeting, 2 years ago
commit 14f10a0461

+ 1 - 0
.dockerignore

@@ -16,6 +16,7 @@ venv/
 .docker-venv/
 node_modules/
 
+docs/
 build/
 dist/
 brew_dist/

+ 13 - 6
.github/workflows/docker.yml

@@ -11,8 +11,7 @@ on:
 
 env:
   DOCKER_IMAGE: archivebox-ci
-
-
+      
 jobs:
   buildx:
     runs-on: ubuntu-latest
@@ -60,13 +59,11 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: archivebox/archivebox,nikisweeting/archivebox
-          flavor: |
-              latest=auto
           tags: |
               type=ref,event=branch
               type=semver,pattern={{version}}
               type=semver,pattern={{major}}.{{minor}}
-              type=sha
+              type=raw,value=latest,enable={{is_default_branch}}
       
       - name: Build and push
         id: docker_build
@@ -78,8 +75,18 @@ jobs:
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.docker_meta.outputs.tags }}
           cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache-new
           platforms: linux/amd64,linux/arm64,linux/arm/v7
 
       - name: Image digest
         run: echo ${{ steps.docker_build.outputs.digest }}
+       
+      # This ugly bit is necessary if you don't want your cache to grow forever
+      # until it hits GitHub's limit of 5GB.
+      # Temp fix
+      # https://github.com/docker/build-push-action/issues/252
+      # https://github.com/moby/buildkit/issues/1896
+      - name: Move cache
+        run: |
+          rm -rf /tmp/.buildx-cache
+          mv /tmp/.buildx-cache-new /tmp/.buildx-cache
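
The new "Move cache" step works around buildx's type=local cache never evicting old layers: each run writes a fresh cache to a sibling directory, then swaps it in wholesale, so the cache can't grow past GitHub's 5GB limit. A rough Python rendering of the same rotation, purely for illustration (paths as in the workflow above):

    import shutil
    from pathlib import Path

    cache = Path('/tmp/.buildx-cache')          # read via cache-from on the next run
    cache_new = Path('/tmp/.buildx-cache-new')  # written via cache-to on this run

    shutil.rmtree(cache, ignore_errors=True)    # drop the stale cache entirely
    cache_new.rename(cache)                     # promote the fresh cache in its place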

+ 12 - 12
Dockerfile

@@ -73,7 +73,8 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
 RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
 
 # Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
-RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
+RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
+    && rm -f /etc/apt/apt.conf.d/docker-clean
 
 # Print debug info about build and save it to disk, for human eyes only, not used by anything else
 RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
@@ -123,7 +124,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
     echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
     && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
-    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
+    && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
     && apt-get update -qq \
     && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
         nodejs libatomic1 python3-minimal \
@@ -202,7 +203,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
     && chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \
     # Save version info
     && ( \
-        which chromium-browser && /usr/bin/chromium-browser --version \
+        which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
         && echo -e '\n\n' \
     ) | tee -a /VERSION.txt
 
@@ -246,15 +247,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
-    && apt-get update -qq \
+    # && apt-get update -qq \
     # install C compiler to build deps on platforms that don't have 32-bit wheels available on pypi
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-        build-essential  \
+    # && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    #     build-essential  \
     # INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
     && pip install -e "$CODE_DIR"[sonic,ldap] \
     # save docker image size and always remove compilers / build tools after building is complete
-    && apt-get purge -y build-essential \
-    && apt-get autoremove -y \
+    # && apt-get purge -y build-essential \
+    # && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
 ####################################################
@@ -276,11 +277,10 @@ ENV IN_DOCKER=True
 
 # Print version for nice docker finish summary
 RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \
-    && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
-    && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ}\n\n" \
-    && "$CODE_DIR/bin/docker_entrypoint.sh" \
-        archivebox version 2>&1 \
+    && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
+    && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \
     ) | tee -a /VERSION.txt
+RUN "$CODE_DIR"/bin/docker_entrypoint.sh version 2>&1 | tee -a /VERSION.txt
 
 ####################################################
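
For reference, the grep/awk pipeline near the top of this file extracts the "version" field from package.json into /VERSION.txt (which later RUN steps append build info to). A Python sketch of that one step, equivalent under the assumption that package.json is valid JSON:

    import json
    from pathlib import Path

    # equivalent of: grep '"version": ' package.json | awk -F'"' '{print $4}' > /VERSION.txt
    version = json.loads(Path('package.json').read_text())['version']
    Path('/VERSION.txt').write_text(version + '\n')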
 

+ 77 - 48
archivebox/config.py

@@ -54,6 +54,7 @@ from .config_stubs import (
 
 ### Pre-Fetch Minimal System Config
 
+TIMEZONE = 'UTC'
 SYSTEM_USER = getpass.getuser() or os.getlogin()
 
 try:
@@ -82,7 +83,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'IN_QEMU':                  {'type': bool,  'default': False},
         'PUID':                     {'type': int,   'default': os.getuid()},
         'PGID':                     {'type': int,   'default': os.getgid()},
-        # TODO: 'SHOW_HINTS':       {'type:  bool,  'default': True},
     },
 
     'GENERAL_CONFIG': {
@@ -377,7 +377,7 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static_index.json',
 }
 
-def get_version(config):
+def get_version(config) -> str:
     try:
         return importlib.metadata.version(__package__ or 'archivebox')
     except importlib.metadata.PackageNotFoundError:
@@ -392,58 +392,76 @@ def get_version(config):
 
     raise Exception('Failed to detect installed archivebox version!')
 
-def get_commit_hash(config):
+def get_commit_hash(config) -> Optional[str]:
+    try:
+        git_dir = config['PACKAGE_DIR'] / '../.git'
+        ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
+        commit_hash = git_dir.joinpath(ref).read_text().strip()
+        return commit_hash
+    except Exception:
+        pass
+
     try:
         return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
     except Exception:
-        return None
+        pass
+    
+    return None
+
+def get_build_time(config) -> str:
+    if config['IN_DOCKER']:
+        docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+        return docker_build_end_time
+
+    src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
+    return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
 
-def get_version_releases(config):
+def get_versions_available_on_github(config):
     """
-    returns a dictionary containing the GitHub release data for 
+    returns a dictionary containing the ArchiveBox GitHub release info for
     the recommended upgrade version and the currently installed version
     """
+    
+    # we only want to perform the (relatively expensive) check for new versions
+    # when its most relevant, e.g. when the user runs a long-running command
+    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+    if subcommand_run_by_user not in long_running_commands:
+        return None
+    
     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
     response = requests.get(github_releases_api)
     if response.status_code != 200:
-        stderr('Failed to get release data from GitHub', color='lightyellow', config=config)
+        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
         return None
+    all_releases = response.json()
 
-    releases = response.json()
-    installed_version = config['VERSION']
-    installed_version_parts = parse_tag_name(installed_version)
+    installed_version = parse_version_string(config['VERSION'])
 
     # find current version or nearest older version (to link to)
     current_version = None
-    for release in releases:
-        release_parts = parse_tag_name(release["tag_name"])
-        if release_parts <= installed_version_parts :
+    for idx, release in enumerate(all_releases):
+        release_version = parse_version_string(release["tag_name"])
+        if release_version <= installed_version:
             current_version = release
             break
 
-    current_version = current_version if current_version else releases[-1]
-
-    # find upgrade version
-    upgrade_version = None
-    smallest_version_diff = parse_tag_name(releases[0]["tag_name"])[1]
-    for release in releases:
-        release_parts = parse_tag_name(release["tag_name"])
-        major_version_diff = release_parts[1] - installed_version_parts[1]
-        if major_version_diff < smallest_version_diff:
-            smallest_version_diff = major_version_diff
-            if smallest_version_diff < 1:
-                break
-            upgrade_version = release
-
-    upgrade_version = upgrade_version if upgrade_version else releases[0]
+    current_version = current_version or all_releases[-1]
+    
+    # recommended version is whatever comes after current_version in the release list
+    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+    try:
+        recommended_version = all_releases[idx+1]
+    except IndexError:
+        recommended_version = None
 
-    return {"upgrade_version": upgrade_version, "current_version": current_version}
+    return {"recommended_version": recommended_version, "current_version": current_version}
 
 def can_upgrade(config):
-    if config['VERSION_RELEASES']:
-        upgrade_version = parse_tag_name(config['VERSION_RELEASES']['upgrade_version']['tag_name'])
-        current_version = parse_tag_name(config['VERSION_RELEASES']['current_version']['tag_name'])
-        return upgrade_version > current_version
+    if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+        recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+        current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+        return recommended_version > current_version
     return False
 
 
@@ -473,11 +491,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'DIR_OUTPUT_PERMISSIONS':   {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
 
     'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'VERSION':                  {'default': lambda c: get_version(c)},
-    'VERSION_RELEASES':         {'default': lambda c: get_version_releases(c)},
-    'CAN_UPGRADE':              {'default': lambda c: can_upgrade(c)},
+
+    'VERSION':                  {'default': lambda c: get_version(c).split('+', 1)[0]},
     'COMMIT_HASH':              {'default': lambda c: get_commit_hash(c)},
+    'BUILD_TIME':               {'default': lambda c: get_build_time(c)},
     
+    'VERSIONS_AVAILABLE':       {'default': lambda c: get_versions_available_on_github(c)},
+    'CAN_UPGRADE':              {'default': lambda c: can_upgrade(c)},
+
     'PYTHON_BINARY':            {'default': lambda c: sys.executable},
     'PYTHON_ENCODING':          {'default': lambda c: sys.stdout.encoding.upper()},
     'PYTHON_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
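
Each of these DYNAMIC_CONFIG_SCHEMA entries is a lambda rather than a value, so defaults are computed lazily and later entries can read keys resolved before them. A minimal model of that resolution (not the real load_config(), which also layers env vars and the config file on top; the values here are stubs):

    schema = {
        'VERSION':     {'default': lambda c: '0.7.1'},                  # stands in for get_version(c)
        'CAN_UPGRADE': {'default': lambda c: c['VERSION'] != '0.7.2'},  # may depend on earlier keys
    }

    config = {}
    for key, entry in schema.items():
        config[key] = entry['default'](config)  # evaluated in declaration order

    print(config)  # {'VERSION': '0.7.1', 'CAN_UPGRADE': True}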
@@ -487,7 +508,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     
     'SQLITE_BINARY':            {'default': lambda c: inspect.getfile(sqlite3)},
     'SQLITE_VERSION':           {'default': lambda c: sqlite3.version},
-    #'SQLITE_JOURNAL_MODE':      {'default': lambda c: 'wal'},         # set at runtime below, interesting but unused for now
+    #'SQLITE_JOURNAL_MODE':      {'default': lambda c: 'wal'},         # set at runtime below, interesting if changed later but unused for now because it's always expected to be wal
     #'SQLITE_OPTIONS':           {'default': lambda c: ['JSON1']},     # set at runtime below
 
     'USE_CURL':                 {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
@@ -744,14 +765,11 @@ def load_config(defaults: ConfigDefaultDict,
 
     return extended_config
 
-# def write_config(config: ConfigDict):
-
-#     with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
 
-def parse_tag_name(v):
-    """parses a version tag string formatted like 'vx.x.x'"""
-    base = v.split('+')[0].split('v')[-1] # remove 'v' prefix and '+editable' suffix
-    return tuple(int(part) for part in base.split('.'))
+def parse_version_string(version: str) -> Tuple[int, int, int]:
+    """parses a version tag string formatted like 'vx.x.x' into (major, minor, patch) ints"""
+    base = version.split('+')[0].split('v')[-1]  # remove 'v' prefix and '+editable' suffix
+    return tuple(int(part) for part in base.split('.'))[:3]
 
 
 # Logging Helpers
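
A few usage examples for the renamed helper (these follow directly from its definition; the last line shows why version tuples are compared instead of raw strings):

    assert parse_version_string('v0.7.2') == (0, 7, 2)
    assert parse_version_string('0.7.2') == (0, 7, 2)            # the 'v' prefix is optional
    assert parse_version_string('v0.7.2+editable') == (0, 7, 2)  # the '+editable' suffix is dropped
    assert parse_version_string('v0.7.9') < parse_version_string('v0.7.10')  # but '0.7.9' < '0.7.10' is False as strings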
@@ -840,6 +858,7 @@ def find_chrome_binary() -> Optional[str]:
     # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
     # make sure data dir finding precedence order always matches binary finding order
     default_executable_paths = (
+        # '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
         'chromium-browser',
         'chromium',
         '/Applications/Chromium.app/Contents/MacOS/Chromium',
@@ -1166,14 +1185,25 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
 
 def check_system_config(config: ConfigDict=CONFIG) -> None:
     ### Check system environment
-    if config['USER'] == 'root':
+    if config['USER'] == 'root' or str(config['PUID']) == "0":
         stderr('[!] ArchiveBox should never be run as root!', color='red')
         stderr('    For more information, see the security overview documentation:')
         stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
+        
+        if config['IN_DOCKER']:
+            attempted_command = ' '.join(sys.argv[:3])
+            stderr('')
+            stderr('    {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
+            stderr(f'        docker compose run archivebox {attempted_command}')
+            stderr(f'        docker compose exec --user=archivebox archivebox {attempted_command}')
+            stderr('        or')
+            stderr(f'        docker run -it -v ... -p ... archivebox/archivebox {attempted_command}')
+            stderr(f'        docker exec -it --user=archivebox <container id> /bin/bash')
+        
         raise SystemExit(2)
 
     ### Check Python environment
-    if sys.version_info[:3] < (3, 6, 0):
+    if sys.version_info[:3] < (3, 7, 0):
-        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
+        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>=3.7 is required)', color='red')
         stderr('    See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
         raise SystemExit(2)
@@ -1249,7 +1279,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
 
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
         stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
-        stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
+        stderr('    youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr()
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
@@ -1337,8 +1367,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
             command = ' '.join(sys.argv)
             ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-            f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
-
+            f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
 
         if check_db:
             # Enable WAL mode in sqlite3

+ 12 - 9
archivebox/core/admin.py

@@ -48,22 +48,25 @@ class TagInline(admin.TabularInline):
 from django.contrib.admin.helpers import ActionForm
 from django.contrib.admin.widgets import AutocompleteSelectMultiple
 
-class AutocompleteTags:
-    model = Tag
-    search_fields = ['name']
+# WIP: commented out because broken by Django 3.1.2 -> 4.0 migration
+# class AutocompleteTags:
+#     model = Tag
+#     search_fields = ['name']
+#     name = 'tags'
 
-class AutocompleteTagsAdminStub:
-    name = 'admin'
+# class AutocompleteTagsAdminStub:
+#     name = 'admin'
 
 
 class SnapshotActionForm(ActionForm):
     tags = forms.ModelMultipleChoiceField(
         queryset=Tag.objects.all(),
         required=False,
-        widget=AutocompleteSelectMultiple(
-            AutocompleteTags(),
-            AutocompleteTagsAdminStub(),
-        ),
+        # WIP: commented out because broken by Django 3.1.2 -> 4.0 migration
+        # widget=AutocompleteSelectMultiple(
+        #     # AutocompleteTags(),
+        #     # AutocompleteTagsAdminStub(),
+        # ),
     )
 
     # TODO: allow selecting actions for specific extractors? is this useful?

+ 1 - 1
archivebox/core/apps.py

@@ -3,4 +3,4 @@ from django.apps import AppConfig
 
 class CoreConfig(AppConfig):
     name = 'core'
-    default_auto_field = 'django.db.models.UUIDField'
+    # default_auto_field = 'django.db.models.UUIDField'

+ 2 - 0
archivebox/core/settings.py

@@ -268,6 +268,8 @@ AUTH_PASSWORD_VALIDATORS = [
     {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
 ]
 
+# WIP: commented out because broken by Django 3.1.2 -> 4.0 migration
+# DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
 
 ################################################################################
 ### Shell Settings

+ 1 - 1
archivebox/extractors/__init__.py

@@ -184,7 +184,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                         link.url,
                         command,
                         ts
-                    ) + "\n"))
+                    ) + "\n" + str(e) + "\n"))
                     #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
 
         # print('    ', stats)

+ 5 - 1
archivebox/logging_util.py

@@ -393,7 +393,11 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
     else:
         _LAST_RUN_STATS.succeeded += 1
 
-    size = get_dir_size(link_dir)
+    try:
+        size = get_dir_size(link_dir)
+    except FileNotFoundError:
+        size = (0, None, '0')
+
     end_ts = datetime.now(timezone.utc)
     duration = str(end_ts - start_ts).split('.')[0]
     print('        {black}{} files ({}) in {}s {reset}'.format(size[2], printable_filesize(size[0]), duration, **ANSI))

+ 27 - 16
archivebox/main.py

@@ -93,6 +93,8 @@ from .config import (
     SQL_INDEX_FILENAME,
     ALLOWED_IN_OUTPUT_DIR,
     SEARCH_BACKEND_ENGINE,
+    LDAP,
+    get_version,
     check_dependencies,
     check_data_folder,
     write_config_file,
@@ -100,6 +102,7 @@ from .config import (
-    VERSION_RELEASES,
+    VERSIONS_AVAILABLE,
     CAN_UPGRADE,
     COMMIT_HASH,
+    BUILD_TIME,
     CODE_LOCATIONS,
     EXTERNAL_LOCATIONS,
     DATA_LOCATIONS,
@@ -220,31 +223,39 @@ def version(quiet: bool=False,
     
     if not quiet:
         # 0.7.1
-        # ArchiveBox v0.7.1 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
-        # DEBUG=False IN_DOCKER=True IN_QEMU=False IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
+        # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
+        # IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
+        # FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
+        # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
         
         p = platform.uname()
         print(
-            'ArchiveBox v{}'.format(VERSION),
-            *((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
-            sys.implementation.name.title(),
-            p.system,
-            platform.platform(),
-            p.machine,
+            'ArchiveBox v{}'.format(get_version(CONFIG)),
+            *((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
+            f'BUILD_TIME={BUILD_TIME}',
         )
-        OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
-            f'DEBUG={DEBUG}',
             f'IN_DOCKER={IN_DOCKER}',
             f'IN_QEMU={IN_QEMU}',
-            f'IS_TTY={IS_TTY}',
-            f'TZ={TIMEZONE}',
-            #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
+            f'ARCH={p.machine}',
+            f'OS={p.system}',
+            f'PLATFORM={platform.platform()}',
+            f'PYTHON={sys.implementation.name.title()}',
+        )
+        OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
+        print(
             f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
             f'FS_USER={PUID}:{PGID}',
             f'FS_PERMS={OUTPUT_PERMISSIONS}',
+        )
+        print(
+            f'DEBUG={DEBUG}',
+            f'IS_TTY={IS_TTY}',
+            f'TZ={TIMEZONE}',
             f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
+            f'LDAP={LDAP}',
+            #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
         )
         print()
 
@@ -273,7 +284,7 @@ def version(quiet: bool=False,
                 print(printable_folder_status(name, path))
         else:
             print()
-            print('{white}[i] Data locations:{reset}'.format(**ANSI))
+            print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI))
 
         print()
         check_dependencies()
@@ -1010,9 +1021,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
 
     stderr('\n    Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
     if not NODE_VERSION:
-        stderr('[X] You must first install node using your system package manager', color='red')
+        stderr('[X] You must first install node & npm using your system package manager', color='red')
         hint([
-            'curl -sL https://deb.nodesource.com/setup_15.x | sudo -E bash -',
+            'https://github.com/nodesource/distributions#table-of-contents',
             'or to disable all node-based modules run: archivebox config --set USE_NODE=False',
         ])
         raise SystemExit(1)

+ 1 - 0
archivebox/static

@@ -0,0 +1 @@
+templates/static

+ 8 - 1
bin/build_docker.sh

@@ -23,6 +23,7 @@ SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
 TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
 SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
+GIT_SHA=sha-"$(git rev-parse --short HEAD)"
 SELECTED_PLATFORMS="${2:-$SUPPORTED_PLATFORMS}"
 
 echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$SELECTED_PLATFORMS"
@@ -50,6 +51,7 @@ function create_builder() {
     docker buildx use xbuilder && return 0
     echo "[+] Creating new xbuilder for: $SELECTED_PLATFORMS"
     echo
+    docker pull 'moby/buildkit:buildx-stable-1'
 
     # Switch to buildx builder if already present / previously created
     docker buildx create --name xbuilder --driver docker-container --bootstrap --use --platform "$SELECTED_PLATFORMS" || true
@@ -74,6 +76,7 @@ echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
 pdm lock --group=':all' --strategy="cross_platform" --production
 pdm export --group=':all' --production --without-hashes -o requirements.txt
 
+
 echo "[+] Building archivebox:$VERSION docker image..."
 # docker builder prune
 # docker build . --no-cache -t archivebox-dev \
@@ -83,12 +86,16 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --load . \
                -t archivebox/archivebox:$TAG_NAME \
                -t archivebox/archivebox:$VERSION \
                -t archivebox/archivebox:$SHORT_VERSION \
+               -t archivebox/archivebox:$GIT_SHA \
                -t archivebox/archivebox:latest \
                -t nikisweeting/archivebox \
                -t nikisweeting/archivebox:$TAG_NAME \
                -t nikisweeting/archivebox:$VERSION \
                -t nikisweeting/archivebox:$SHORT_VERSION \
+               -t nikisweeting/archivebox:$GIT_SHA \
                -t nikisweeting/archivebox:latest \
                -t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
                -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
-               -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION
+               -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
+               -t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA \
+               -t ghcr.io/archivebox/archivebox/archivebox:latest

+ 65 - 24
bin/docker_entrypoint.sh

@@ -1,20 +1,55 @@
 #!/bin/bash
 
+# This Docker ENTRYPOINT script is called by `docker run archivebox ...` or `docker compose run archivebox ...`.
+# It takes a CMD as $* shell arguments and runs it following these setup steps:
+
+# - Set the archivebox user to use the correct PUID & PGID
+#     1. highest precedence is for valid PUID and PGID env vars passed in explicitly
+#     2. fall back to DETECTED_PUID of files found within existing data dir
+#     3. fall back to DEFAULT_PUID if no data dir or its owned by root
+# - Create a new /data dir if necessary and set the correct ownership on it
+# - Create a new /browsers dir if necessary and set the correct ownership on it
+# - Check whether we're running inside QEMU emulation and show a warning if so.
+# - Drop down to archivebox user permissions and execute passed CMD command.
+
+# Bash Environment Setup
+# http://redsymbol.net/articles/unofficial-bash-strict-mode/
+# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
+# set -o xtrace
+# set -o nounset
+set -o errexit
+set -o errtrace
+set -o pipefail
+# IFS=$'\n'
+
+# Load global invariants (set by Dockerfile during image build time, not intended to be customized by users at runtime)
 export DATA_DIR="${DATA_DIR:-/data}"
 export ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
 
-# default PUID and PGID if data dir is empty and no PUID+PGID is set
+# Global default PUID and PGID if data dir is empty and no intended PUID+PGID is set manually by user
 export DEFAULT_PUID=911
 export DEFAULT_PGID=911
 
-# if data directory already exists, autodetect detect owner by looking at files within
-export DETECTED_UID="$(stat -c '%u' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PUID")"
-export DETECTED_GID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PGID")"
+# If user tries to set PUID and PGID to root values manually, catch and reject because root is not allowed
+if [[ "$PUID" == "0" ]] || [[ "$PGID" == "0" ]]; then
+    echo -e "\n[X] Error: Got PUID=$PUID and PGID=$PGID but ArchiveBox is not allowed to be run as root, please change or unset PUID & PGID and try again." > /dev/stderr
+    echo -e "    Hint: some NFS/SMB/FUSE/etc. filesystems force-remap all permissions, leave PUID/PGID blank" > /dev/stderr
+    echo -e "          or set PUID/PGID to the same value as the user/group they remap to (e.g. $DEFAULT_PUID:$DEFAULT_PGID)." > /dev/stderr
+    echo -e "    https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr
+    exit 3
+fi
+
+# If the data directory already exists, autodetect the owner by looking at files within
+export DETECTED_PUID="$(stat -c '%u' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PUID")"
+export DETECTED_PGID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PGID")"
+
+# If data directory exists but is owned by root, use defaults instead of root because root is not allowed
+[[ "$DETECTED_PUID" == "0" ]] && export DETECTED_PUID="$DEFAULT_PUID"
+[[ "$DETECTED_PGID" == "0" ]] && export DETECTED_PGID="$DEFAULT_PGID"
 
-# Set the archivebox user to use the configured UID & GID
-# prefers PUID and PGID env vars passsed in explicitly, falls back to autodetected defaults
-usermod -o -u "${PUID:-$DETECTED_UID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
-groupmod -o -g "${PGID:-$DETECTED_GID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
+# Set archivebox user and group ids to desired PUID/PGID
+usermod -o -u "${PUID:-$DETECTED_PUID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
+groupmod -o -g "${PGID:-$DETECTED_PGID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
 
 # re-set PUID and PGID to values reported by system instead of values we tried to set,
 # in case wonky filesystems or Docker setups try to play UID/GID remapping tricks on us
@@ -29,12 +64,12 @@ if [[ -d "$DATA_DIR/archive" ]]; then
         # echo "[√] Permissions are correct"
     else
 # the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.)
-        echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir." >&2
-        echo -e "    Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:"
-        echo -e "       \$ chown -R $PUID:$PGID ./data\n" >&2
-        echo -e "    Configure the PUID & PGID environment variables to change the desired owner:" >&2
-        echo -e "       https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" >&2
-        exit 1
+        echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir (currently owned by $(stat -c '%u' "$DATA_DIR"):$(stat -c '%g' "$DATA_DIR"))." >&2
+        echo -e "    Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr
+        echo -e "       \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr
+        echo -e "    Configure the PUID & PGID environment variables to change the desired owner:" > /dev/stderr
+        echo -e "       https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" > /dev/stderr
+        exit 3
     fi
 else
     # create data directory
@@ -46,29 +81,35 @@ fi
 chown $PUID:$PGID "$DATA_DIR"
 chown $PUID:$PGID "$DATA_DIR"/*
 
-# also chown BROWSERS_DIR because otherwise 'archivebox setup' wont be able to install chrome
+# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to install chrome at runtime
 PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
+mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
 chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
-chown $PUID:$PGID "${PLAYWRIGHT_BROWSERS_PATH}/*"
+chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
+rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
+
 
 # (this check is written in blood, QEMU silently breaks things in ways that are not obvious)
-export IN_QEMU="$(pmap 1 | grep qemu | wc -l | grep -E '^0$' >/dev/null && echo 'False' || echo 'True')"
-if [[ "$IN_QEMU" == 'True' ]]; then
-    echo -e "\n[!] Warning: Running $(uname -m) emulated container in QEMU, some things will break!" >&2
-    echo -e "    chromium (screenshot, pdf, dom), singlefile, and any dependencies that rely on inotify will not run in QEMU." >&2
-    echo -e "    See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" >&2
+export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"
+if [[ "$IN_QEMU" == "True" ]]; then
+    echo -e "\n[!] Warning: Running $(uname -m) docker image using QEMU emulation, some things will break!" > /dev/stderr
+    echo -e "    chromium (screenshot, pdf, dom), singlefile, and any dependencies that rely on inotify will not run in QEMU." > /dev/stderr
+    echo -e "    See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr
 fi
 
+
 # Drop permissions to run commands as the archivebox user
 if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" == "cat" || "$1" == "archivebox" ]]; then
     # handle "docker run archivebox /some/non-archivebox/command --with=some args" by passing args directly to bash -c
-    # e.g. "docker run archivebox /venv/bin/archivebox-alt init"
+    # e.g. "docker run archivebox archivebox init"
+    #      "docker run archivebox /venv/bin/archivebox-alt init"
     #      "docker run archivebox /bin/bash -c '...'"
-    #      "docker run archivebox echo test"
+    #      "docker run archivebox cat /VERSION.txt"
     exec gosu "$PUID" bash -c "$*"
 else
     # handle "docker run archivebox add some subcommand --with=args abc" by calling archivebox to run as args as CLI subcommand
-    # e.g. "docker run archivebox add --depth=1 https://example.com"
+    # e.g. "docker run archivebox help"
+    #      "docker run archivebox add --depth=1 https://example.com"
     #      "docker run archivebox manage createsupseruser"
     #      "docker run archivebox server 0.0.0.0:8000"
     exec gosu "$PUID" bash -c "archivebox $*"

+ 6 - 1
bin/release_docker.sh

@@ -18,6 +18,7 @@ SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
 TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
 SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
+GIT_SHA=sha-"$(git rev-parse --short HEAD)"
 SELECTED_PLATFORMS="${2:-$SUPPORTED_PLATFORMS}"
 
 
@@ -34,12 +35,16 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --push . \
                -t archivebox/archivebox:$TAG_NAME \
                -t archivebox/archivebox:$VERSION \
                -t archivebox/archivebox:$SHORT_VERSION \
+               -t archivebox/archivebox:$GIT_SHA \
                -t archivebox/archivebox:latest \
                -t nikisweeting/archivebox \
                -t nikisweeting/archivebox:$TAG_NAME \
                -t nikisweeting/archivebox:$VERSION \
                -t nikisweeting/archivebox:$SHORT_VERSION \
+               -t nikisweeting/archivebox:$GIT_SHA \
                -t nikisweeting/archivebox:latest \
                -t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
                -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
-               -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION
+               -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
+               -t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA
+

+ 1 - 2
docker-compose.yml

@@ -21,7 +21,6 @@ services:
             # - ./etc/crontabs:/var/spool/cron/crontabs  # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
             # - ./archivebox:/app/archivebox             # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
         # build: .                                       # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
-        
         environment:
             - ALLOWED_HOSTS=*                   # restrict this to only accept incoming traffic via specific domain name
             # - PUBLIC_INDEX=True               # set to False to prevent anonymous users from viewing snapshot list
@@ -161,4 +160,4 @@ networks:
         ipam:
             driver: default
             config:
-                - subnet: 172.20.0.0/24
+                - subnet: 172.20.0.0/24

+ 8 - 0
etc/crontabs/archivebox

@@ -0,0 +1,8 @@
+# DO NOT EDIT THIS FILE - edit the master and reinstall.
+# (/tmp/tmpe3dawo9u installed on Tue Jun 13 23:21:48 2023)
+# (Cron version -- $Id: crontab.c,v 2.13 1994/01/17 03:20:37 vixie Exp $)
+
+@daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/3" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
+@daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/2" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
+@daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
+@daily cd /data && /usr/local/bin/archivebox add --depth=0 "update" >> /data/logs/schedule.log 2>&1 # archivebox_schedule

File diff suppressed because it is too large
+ 513 - 0
package-lock.json


File diff suppressed because it is too large
+ 33 - 815
pdm.lock


+ 3 - 0
pyproject.toml

@@ -39,6 +39,9 @@ classifiers = [
     "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Topic :: Internet :: WWW/HTTP",
     "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
     "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",

+ 22 - 14
requirements.txt

@@ -1,35 +1,41 @@
 # This file is @generated by PDM.
 # Please do not edit it manually.
 
-appnope==0.1.3
 asgiref==3.7.2
 asttokens==2.4.1
-brotli==1.1.0
-certifi==2023.7.22
+brotli==1.1.0; implementation_name == "cpython"
+brotlicffi==1.1.0.0; implementation_name != "cpython"
+certifi==2023.11.17
+cffi==1.16.0; implementation_name != "cpython"
 charset-normalizer==3.3.2
+colorama==0.4.6; sys_platform == "win32"
 croniter==2.0.1
-dateparser==1.1.8
+dateparser==1.2.0
 decorator==5.1.1
 django==3.1.14
+django-auth-ldap==4.1.0
 django-extensions==3.1.5
+exceptiongroup==1.2.0; python_version < "3.11"
 executing==2.0.1
-idna==3.4
-ipython==8.17.2
+idna==3.6
+ipython==8.18.1
 jedi==0.19.1
 matplotlib-inline==0.1.6
 mutagen==1.47.0
 mypy-extensions==1.0.0
 parso==0.8.3
-pexpect==4.8.0
-prompt-toolkit==3.0.40
-ptyprocess==0.7.0
+pexpect==4.9.0; sys_platform != "win32"
+prompt-toolkit==3.0.43
+ptyprocess==0.7.0; sys_platform != "win32"
 pure-eval==0.2.2
-pyasn1==0.5.0
+pyasn1==0.5.1
 pyasn1-modules==0.3.0
+pycparser==2.21; implementation_name != "cpython"
 pycryptodomex==3.19.0
-pygments==2.16.1
+pygments==2.17.2
 python-crontab==3.0.0
 python-dateutil==2.8.2
+python-ldap==3.4.4
 pytz==2023.3.post1
 regex==2023.10.3
 requests==2.31.0
@@ -37,10 +43,12 @@ six==1.16.0
 sonic-client==1.0.0
 sqlparse==0.4.4
 stack-data==0.6.3
-traitlets==5.13.0
+traitlets==5.14.0
+typing-extensions==4.9.0; python_version < "3.11"
+tzdata==2023.3; platform_system == "Windows"
 tzlocal==5.2
 urllib3==2.1.0
 w3lib==2.1.2
-wcwidth==0.2.10
+wcwidth==0.2.12
 websockets==12.0
-yt-dlp==2023.11.14
+yt-dlp==2023.11.16
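
Note the PEP 508 environment markers now attached to several pins (brotlicffi only off CPython, colorama only on Windows, etc.), which is why the file gained lines while staying equivalent on CPython/Linux. Markers like these can be evaluated programmatically with the packaging library:

    from packaging.markers import Marker

    marker = Marker('implementation_name != "cpython"')
    print(marker.evaluate())  # False on CPython (brotli is used), True on PyPy (brotlicffi instead)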

Too many files changed in this diff; some files were not shown.