2
0
Эх сурвалжийг харах

bring image back down to 700mb

Nick Sweeting 1 жил өмнө
parent
commit
54d4d7f640
3 өөрчлөгдсөн 102 нэмэгдсэн , 52 устгасан
  1. 1 0
      .dockerignore
  2. 54 52
      Dockerfile
  3. 47 0
      bin/docker_layers.sh

+ 1 - 0
.dockerignore

@@ -37,6 +37,7 @@ docker/
 website/
 typings/
 
+tmp/
 data/
 data*/
 output/

+ 54 - 52
Dockerfile

@@ -28,8 +28,7 @@
 
 #########################################################################################
 
-FROM python:3.11-slim-bookworm
-# FROM debian:bookworm-backports  # Tried using faster bookworm-backports but wasn't worth it due to more frequent breakages: https://packages.debian.org/bookworm-backports/
+FROM ubuntu:24.04
 
 LABEL name="archivebox" \
     maintainer="Nick Sweeting <[email protected]>" \
@@ -55,7 +54,6 @@ ARG TARGETPLATFORM
 ARG TARGETOS
 ARG TARGETARCH
 ARG TARGETVARIANT
-
 ######### Environment Variables #################################
 
 # Global build-time and runtime environment constants + default pkg manager config
@@ -71,7 +69,7 @@ ENV TZ=UTC \
     npm_config_loglevel=error
 
 # Language Version config
-ENV PYTHON_VERSION=3.11 \
+ENV PYTHON_VERSION=3.12 \
     NODE_VERSION=22
 
 # Non-root User config
@@ -96,9 +94,6 @@ SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "
 # Detect ArchiveBox version number by reading pyproject.toml (also serves to invalidate the entire build cache whenever pyproject.toml changes)
 WORKDIR "$CODE_DIR"
 
-RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
-    grep '^version = ' "/app/pyproject.toml" | awk -F'"' '{print $2}' > /VERSION.txt
-
 # Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up back-to-back Docker builds)
 RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
     && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
@@ -106,7 +101,7 @@ RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d
     && rm -f /etc/apt/apt.conf.d/docker-clean
 
 # Print debug info about build and save it to disk, for human eyes only, not used by anything else
-RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
+RUN (echo "[i] Docker build for ArchiveBox starting..." \
     && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
     && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
     && echo \
@@ -134,10 +129,9 @@ RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
 # Install system apt dependencies
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
     echo "[+] APT Installing base system dependencies for $TARGETPLATFORM..." \
-    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
     && mkdir -p /etc/apt/keyrings \
     && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports \
+    && apt-get install -qq -y \
         # 1. packaging dependencies
         apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
         # 2. docker and init system dependencies
@@ -147,6 +141,27 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
         # nano iputils-ping dnsutils htop procps jq yq
     && rm -rf /var/lib/apt/lists/*
 
+# Install apt binary dependencies for extractors
+# COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
+    echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
+    && apt-get update -qq \
+    && apt-get install -qq -y --no-install-recommends \
+        git ffmpeg ripgrep \
+        # Packages we have also needed in the past:
+        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
+        # curl wget (already installed above)
+    && rm -rf /var/lib/apt/lists/* \
+    # Save version info
+    && ( \
+        which curl && curl --version | head -n1 \
+        && which wget && wget --version 2>&1 | head -n1 \
+        && which git && git --version 2>&1 | head -n1 \
+        && which ffmpeg && (ffmpeg --version 2>&1 | head -n1) || true \
+        && which rg && rg --version 2>&1 | head -n1 \
+        && echo -e '\n\n' \
+    ) | tee -a /VERSION.txt
+
 # Install sonic search backend
 COPY --from=archivebox/sonic:1.4.9 /usr/local/bin/sonic /usr/local/bin/sonic
 COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg
@@ -160,7 +175,7 @@ RUN (which sonic && sonic --version) | tee -a /VERSION.txt
 #    --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
 # RUN echo "[+] APT Installing PYTHON $PYTHON_VERSION for $TARGETPLATFORM (skipped, provided by base image)..." \
     # && apt-get update -qq \
-    # && apt-get install -qq -y -t bookworm-backports --no-upgrade \
+    # && apt-get install -qq -y --no-upgrade \
     #     python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip python${PYTHON_VERSION}-venv pipx \
     # && rm -rf /var/lib/apt/lists/* \
     # tell PDM to allow using global system python site packages
@@ -188,8 +203,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
     && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
     && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
     && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
-    && apt-get install -y -t bookworm-backports --no-upgrade \
+    && apt-get install -qq -y --no-upgrade libatomic1 \
+    && apt-get install -y --no-upgrade \
         nodejs \
     && rm -rf /var/lib/apt/lists/* \
     # Update NPM to latest version
@@ -205,25 +220,23 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 # Set up uv and main app /venv
 COPY --from=ghcr.io/astral-sh/uv:0.5 /uv /uvx /bin/
 ENV UV_COMPILE_BYTECODE=1 \
+    UV_PYTHON_PREFERENCE=only-system \
     UV_LINK_MODE=copy \
-    UV_PROJECT_ENVIRONMENT=/venv \
-    PATH="/venv/bin:$PATH"
+    UV_PROJECT_ENVIRONMENT=/venv
 WORKDIR "$CODE_DIR"
 # COPY --chown=root:root --chmod=755 pyproject.toml "$CODE_DIR/"
 RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
     echo "[+] UV Creating /venv using python ${PYTHON_VERSION} for ${TARGETPLATFORM} (provided by base image)..." \
-    && uv venv \
-    && uv pip install setuptools pip \
-    && ln -s /venv "$CODE_DIR/.venv" \
+    && uv venv /venv
+ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
+RUN uv pip install setuptools pip \
     && ( \
-        which python3 && python3 --version | grep " $PYTHON_VERSION" \
-        && which pip && pip --version \
+        which python3 && python3 --version \
         && which uv && uv version \
         && echo -e '\n\n' \
     ) | tee -a /VERSION.txt
 
 
-
 ######### ArchiveBox & Extractor Dependencies ##################################
 
 # Install ArchiveBox C-compiled/apt-installed Python dependencies in app /venv (currently only used for python-ldap)
@@ -233,41 +246,24 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
     #--mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
     echo "[+] APT Installing + Compiling python3-ldap for PIP archivebox[ldap] on ${TARGETPLATFORM}..." \
     && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+    && apt-get install -qq -y --no-install-recommends \
         build-essential gcc \
-        libssl-dev libldap2-dev libsasl2-dev python3-ldap \
+        python3-dev libssl-dev libldap2-dev libsasl2-dev python3-ldap \
         python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
     && uv pip install \
         "python-ldap>=3.4.3" \
     && apt-get purge -y \
-        build-essential gcc \
+        python3-dev build-essential gcc \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
 
-# Install apt binary dependencies for exractors
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
-    echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." \
-    && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports \
-        curl wget git ffmpeg ripgrep pipx \
-        # Packages we have also needed in the past:
-        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
-    && rm -rf /var/lib/apt/lists/* \
-    # Save version info
-    && ( \
-        which curl && curl --version | head -n1 \
-        && which wget && wget --version 2>&1 | head -n1 \
-        && which git && git --version 2>&1 | head -n1 \
-        && which rg && rg --version 2>&1 | head -n1 \
-        && echo -e '\n\n' \
-    ) | tee -a /VERSION.txt
-
 # Install apt font & rendering dependencies for chromium browser
+# TODO: figure out how much of this overlaps with `playwright install-deps chromium`
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
     echo "[+] APT Installing CHROMIUM dependencies, fonts, and display libraries for $TARGETPLATFORM..." \
     && apt-get update -qq \
-    && apt-get install -qq -y -t bookworm-backports \
+    && apt-get install -qq -y \
         fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
         at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
         libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
@@ -285,17 +281,20 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows
     --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
     echo "[+] PIP Installing playwright into /venv and CHROMIUM binary into $PLAYWRIGHT_BROWSERS_PATH..." \
     && uv pip install "playwright>=1.49.1" \
-    && uv run playwright install chromium --with-deps \
+    && uv run playwright install chromium --no-shell \  
+    # --with-deps \
     && export CHROME_BINARY="$(uv run python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
     && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
     && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
     && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/home/${ARCHIVEBOX_USER}/.config" \
     && mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \
     && chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \
+    # delete extra full copy of node that playwright installs (saves >100mb)
+    && rm -f /venv/lib/python$PYTHON_VERSION/site-packages/playwright/driver/node \
     # Save version info
     && ( \
         uv pip show playwright \
-        && uv run playwright --version \
+        # && uv run playwright --version \
         && which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
         && echo -e '\n\n' \
     ) | tee -a /VERSION.txt
@@ -304,15 +303,16 @@ RUN --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=brows
 ENV PATH="/home/$ARCHIVEBOX_USER/.npm/bin:$PATH"
 USER $ARCHIVEBOX_USER
 WORKDIR "/home/$ARCHIVEBOX_USER/.npm"
-RUN --mount=type=cache,target=/home/$ARCHIVEBOX_USER/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
-    echo "[+] NPM Installing extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \
+RUN --mount=type=cache,target=/home/archivebox/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
+    echo "[+] NPM Installing node extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \
     && npm config set prefix "/home/$ARCHIVEBOX_USER/.npm" \
     && npm install --global --prefer-offline --no-fund --no-audit --cache "/home/$ARCHIVEBOX_USER/.npm_cache" \
         "@postlight/parser@^2.2.3" \
         "readability-extractor@github:ArchiveBox/readability-extractor" \
         "single-file-cli@^1.1.54" \
         "puppeteer@^23.5.0" \
-        "@puppeteer/browsers@^2.4.0"
+        "@puppeteer/browsers@^2.4.0" \
+    && rm -Rf "/home/$ARCHIVEBOX_USER/.cache/puppeteer"
 USER root
 WORKDIR "$CODE_DIR"
 RUN ( \
@@ -328,13 +328,14 @@ RUN ( \
 ######### Build Dependencies ####################################
 
 
-
 # Install ArchiveBox Python venv dependencies from uv.lock
-COPY --chown=root:root --chmod=755 "pyproject.toml" "uv.lock" "$CODE_DIR"/
-RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
+RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=/app/uv.lock \
+    --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
     echo "[+] PIP Installing ArchiveBox dependencies from pyproject.toml and uv.lock..." \
     && uv sync \
         --frozen \
+        --inexact \
         --all-extras \
         --no-install-project \
         --no-install-workspace
@@ -345,8 +346,9 @@ COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
 RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \
     echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \
     && uv sync \
-        --all-extras \
         --frozen \
+        --inexact \
+        --all-extras \
     && ( \
         uv tree \
         && which archivebox \

+ 47 - 0
bin/docker_layers.sh

@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+# This script takes a single Docker image tag (e.g. "ubuntu:latest") as input
+# and shows the contents of the filesystem for each layer in the image.
+
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 <image_tag>"
+    exit 1
+fi
+
+IMAGE=$1
+# Work in ./tmp (not a mktemp -d dir); named WORK_DIR so the TMPDIR env var is left alone
+WORK_DIR="$PWD/tmp"
+mkdir -p "${WORK_DIR}"
+
+# Save the Docker image to a tar archive (pipefail: pv must not mask a docker failure)
+set -o pipefail
+echo "Saving Docker image '$IMAGE'..."
+if ! docker save "$IMAGE" | pv > "${WORK_DIR}/image.tar"; then
+    echo "Failed to save image '$IMAGE'. Make sure the image exists and Docker is running."
+    rm -f "${WORK_DIR}/image.tar"
+    exit 1
+fi
+
+cd "${WORK_DIR}" || exit 1
+
+# Unpack the saved image; no -z because `docker save` writes an uncompressed tar
+echo "Extracting image metadata..."
+pwd
+tar -xf image.tar || exit 1
+chmod -R u+rwX .
+cd blobs/sha256 || exit 1
+
+# Every regular file in blobs/sha256 is unpacked in place, so files from successive
+# blobs accumulate here. Blobs that are not tar archives just make tar print an error.
+for LAYERFILE in ./*; do
+    if [ -f "${LAYERFILE}" ]; then
+        tar -xf "${LAYERFILE}"
+        rm "${LAYERFILE}"
+        echo "-----------------------------------------------------------------"
+        echo "Contents of layer: ${LAYERFILE%/}"
+        echo "-----------------------------------------------------------------"
+        # Show the top two directory levels of everything unpacked so far
+        tree -L 2
+        echo
+    fi
+done