Dockerfile 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
# This is the Dockerfile for ArchiveBox, it bundles the following dependencies:
#     python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file
# Usage:
#     git submodule update --init --recursive
#     git pull --recurse-submodules
#     docker build . -t archivebox --no-cache
#     docker run -v "$PWD/data":/data archivebox init
#     docker run -v "$PWD/data":/data archivebox add 'https://example.com'
#     docker run -v "$PWD/data":/data -it archivebox manage createsuperuser
#     docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
#     docker buildx create --use
#     docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:0.7.5 -t archivebox/archivebox:dev
#
# Read more about [developing ArchiveBox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
#########################################################################################
  17. # Use Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/
  18. FROM python:3.11-slim-bookworm
  19. # FROM debian:bookworm-backports
  20. LABEL name="archivebox" \
  21. maintainer="Nick Sweeting <[email protected]>" \
  22. description="All-in-one self-hosted internet archiving solution" \
  23. homepage="https://github.com/ArchiveBox/ArchiveBox" \
  24. documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \
  25. org.opencontainers.image.title="ArchiveBox" \
  26. org.opencontainers.image.vendor="ArchiveBox" \
  27. org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \
  28. org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \
  29. com.docker.image.source.entrypoint="Dockerfile" \
  30. # TODO: release ArchiveBox as a Docker Desktop extension (requires these labels):
  31. # https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/
  32. com.docker.desktop.extension.api.version=">= 1.4.7" \
  33. com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \
  34. com.docker.extension.publisher-url="https://archivebox.io" \
  35. com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \
  36. com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \
  37. com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \
  38. com.docker.extension.categories='database,utility-tools'
  39. ARG TARGETPLATFORM
  40. ARG TARGETOS
  41. ARG TARGETARCH
  42. ARG TARGETVARIANT
  43. ######### Environment Variables #################################
  44. # Global system-level config
  45. ENV TZ=UTC \
  46. LANGUAGE=en_US:en \
  47. LC_ALL=C.UTF-8 \
  48. LANG=C.UTF-8 \
  49. DEBIAN_FRONTEND=noninteractive \
  50. APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
  51. PYTHONIOENCODING=UTF-8 \
  52. PYTHONUNBUFFERED=1 \
  53. PIP_DISABLE_PIP_VERSION_CHECK=1 \
  54. npm_config_loglevel=error
  55. # Version config
  56. ENV PYTHON_VERSION=3.11 \
  57. NODE_VERSION=22
  58. # User config
  59. ENV ARCHIVEBOX_USER="archivebox" \
  60. DEFAULT_PUID=911 \
  61. DEFAULT_PGID=911 \
  62. IN_DOCKER=True
  63. # Global paths
  64. ENV CODE_DIR=/app \
  65. DATA_DIR=/data \
  66. PLAYWRIGHT_BROWSERS_PATH=/browsers
  67. # GLOBAL_VENV=/venv \
  68. # TODO: add TMP_DIR and LIB_DIR?
  69. # Build shell config
  70. SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "nounset", "-c"]
  71. ######### System Environment ####################################
  72. # Detect ArchiveBox version number by reading pyproject.toml
  73. COPY --chown=root:root --chmod=755 pyproject.toml "$CODE_DIR/"
  74. RUN grep '^version = ' "${CODE_DIR}/pyproject.toml" | awk -F'"' '{print $2}' > /VERSION.txt
  75. # Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
  76. RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \
  77. && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \
  78. && echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-intall-suggests \
  79. && rm -f /etc/apt/apt.conf.d/docker-clean
  80. # Print debug info about build and save it to disk, for human eyes only, not used by anything else
  81. RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
  82. && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
  83. && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
  84. && echo \
  85. && echo "PYTHON=${PYTHON_VERSION} NODE=${NODE_VERSION} PATH=${PATH}" \
  86. && echo "CODE_DIR=${CODE_DIR} DATA_DIR=${DATA_DIR}" \
  87. && echo \
  88. && uname -a \
  89. && cat /etc/os-release | head -n7 \
  90. && which bash && bash --version | head -n1 \
  91. && which dpkg && dpkg --version | head -n1 \
  92. && echo -e '\n\n' && env && echo -e '\n\n' \
  93. ) | tee -a /VERSION.txt
  94. # Create non-privileged user for archivebox and chrome
  95. RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
  96. && groupadd --system $ARCHIVEBOX_USER \
  97. && useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \
  98. && usermod -u "$DEFAULT_PUID" "$ARCHIVEBOX_USER" \
  99. && groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \
  100. && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u $ARCHIVEBOX_USER) PGID=$(id -g $ARCHIVEBOX_USER)\n\n" \
  101. | tee -a /VERSION.txt
  102. # DEFAULT_PUID and DEFAULT_PID are overriden by PUID and PGID in /bin/docker_entrypoint.sh at runtime
  103. # https://docs.linuxserver.io/general/understanding-puid-and-pgid
  104. # Install system apt dependencies (adding backports to access more recent apt updates)
  105. RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
  106. echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
  107. && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' > /etc/apt/sources.list.d/backports.list \
  108. && mkdir -p /etc/apt/keyrings \
  109. && apt-get update -qq \
  110. && apt-get install -qq -y -t bookworm-backports \
  111. # 1. packaging dependencies
  112. apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
  113. # 2. docker and init system dependencies
  114. zlib1g-dev dumb-init gosu cron unzip grep dnsutils \
  115. # 3. frivolous CLI helpers to make debugging failed archiving easier
  116. tree nano iputils-ping \
  117. # nano iputils-ping dnsutils htop procps jq yq
  118. && rm -rf /var/lib/apt/lists/*
  119. # Install sonic search backend
  120. COPY --from=archivebox/sonic:1.4.9 /usr/local/bin/sonic /usr/local/bin/sonic
  121. COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg
  122. RUN (which sonic && sonic --version) | tee -a /VERSION.txt
  123. ######### Language Environments ####################################
  124. # Install Python environment
  125. RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
  126. echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
  127. # NOT NEEDED because we're using a pre-built python image, keeping this here in case we switch back to custom-building our own:
  128. # && apt-get update -qq \
  129. # && apt-get install -qq -y -t bookworm-backports --no-upgrade \
  130. # python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip python${PYTHON_VERSION}-venv pipx \
  131. # && rm -rf /var/lib/apt/lists/* \
  132. # tell PDM to allow using global system python site packages
  133. # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
  134. # && ln -s "$(which python${PYTHON_VERSION})" /usr/bin/python \
  135. # create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
  136. # && python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
  137. # && python3 -m venv --system-site-packages $GLOBAL_VENV \
  138. # && python3 -m venv $GLOBAL_VENV \
  139. # install global dependencies / python build dependencies in GLOBAL_VENV
  140. # && pip install --upgrade pip setuptools wheel \
  141. # Save version info
  142. && ( \
  143. which python3 && python3 --version | grep " $PYTHON_VERSION" \
  144. && which pip && pip --version \
  145. # && which pdm && pdm --version \
  146. && echo -e '\n\n' \
  147. ) | tee -a /VERSION.txt
  148. # Install Node environment
  149. RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
  150. echo "[+] Installing Node $NODE_VERSION environment..." \
  151. && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
  152. && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
  153. && apt-get update -qq \
  154. && apt-get install -qq -y -t bookworm-backports --no-upgrade libatomic1 \
  155. && apt-get install -y -t bookworm-backports --no-upgrade \
  156. nodejs \
  157. && rm -rf /var/lib/apt/lists/* \
  158. # Update NPM to latest version
  159. && npm i -g npm --cache /root/.npm \
  160. # Save version info
  161. && ( \
  162. which node && node --version \
  163. && which npm && npm --version \
  164. && echo -e '\n\n' \
  165. ) | tee -a /VERSION.txt
  166. ######### Extractor Dependencies ##################################
  167. # Install apt dependencies
  168. RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
  169. echo "[+] Installing APT extractor dependencies globally using apt..." \
  170. && apt-get update -qq \
  171. && apt-get install -qq -y -t bookworm-backports \
  172. curl wget git ffmpeg ripgrep \
  173. # Packages we have also needed in the past:
  174. # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
  175. && rm -rf /var/lib/apt/lists/* \
  176. # Save version info
  177. && ( \
  178. which curl && curl --version | head -n1 \
  179. && which wget && wget --version 2>&1 | head -n1 \
  180. && which git && git --version 2>&1 | head -n1 \
  181. && which rg && rg --version 2>&1 | head -n1 \
  182. && echo -e '\n\n' \
  183. ) | tee -a /VERSION.txt
  184. # Install chromium browser using playwright
  185. RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
  186. echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
  187. && apt-get update -qq \
  188. && apt-get install -qq -y -t bookworm-backports \
  189. fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
  190. at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \
  191. libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \
  192. libxaw7 libxcomposite1 libxdamage1 libxfont2 \
  193. libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils x11-utils xfonts-encodings \
  194. # xfonts-scalable xfonts-utils xserver-common xvfb \
  195. # chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
  196. # libxss1 dbus dbus-x11 upower \
  197. # && service dbus start \
  198. # install Chromium using playwright
  199. && pip install playwright \
  200. && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
  201. && playwright install chromium \
  202. && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
  203. && rm -rf /var/lib/apt/lists/* \
  204. && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
  205. && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
  206. && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/home/${ARCHIVEBOX_USER}/.config" \
  207. && mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \
  208. && chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \
  209. # Save version info
  210. && ( \
  211. which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
  212. && echo -e '\n\n' \
  213. ) | tee -a /VERSION.txt
  214. # Install Node dependencies
  215. ENV PATH="/home/$ARCHIVEBOX_USER/.npm/bin:$PATH"
  216. USER $ARCHIVEBOX_USER
  217. WORKDIR "/home/$ARCHIVEBOX_USER/.npm"
  218. RUN --mount=type=cache,target=/home/$ARCHIVEBOX_USER/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \
  219. echo "[+] Installing NPM extractor dependencies in /home/$ARCHIVEBOX_USER/.npm..." \
  220. && npm config set prefix "/home/$ARCHIVEBOX_USER/.npm" \
  221. && npm install --global --prefer-offline --no-fund --no-audit --cache "/home/$ARCHIVEBOX_USER/.npm_cache" \
  222. "@postlight/parser@^2.2.3" \
  223. "readability-extractor@github:ArchiveBox/readability-extractor" \
  224. "single-file-cli@^1.1.54" \
  225. "puppeteer@^23.5.0" \
  226. "@puppeteer/browsers@^2.4.0"
  227. USER root
  228. RUN ( \
  229. which node && node --version \
  230. && which npm && npm version \
  231. && which postlight-parser \
  232. && which readability-extractor && readability-extractor --version \
  233. && which single-file && single-file --version \
  234. && which puppeteer && puppeteer --version \
  235. && echo -e '\n\n' \
  236. ) | tee -a /VERSION.txt
  237. ######### Build Dependencies ####################################
  238. # Install ArchiveBox Python dependencies
  239. WORKDIR "$CODE_DIR"
  240. COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_DIR"/
  241. RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
  242. echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
  243. && apt-get update -qq \
  244. && apt-get install -qq -y -t bookworm-backports \
  245. build-essential gcc \
  246. libssl-dev libldap2-dev libsasl2-dev \
  247. python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
  248. pipx \
  249. # && ln -s "$GLOBAL_VENV" "$APP_VENV" \
  250. # && pdm use --venv in-project \
  251. # && pdm run python -m ensurepip \
  252. # && pdm sync --fail-fast --no-editable --group :all --no-self \
  253. # && pdm export -o requirements.txt --without-hashes \
  254. # && source $GLOBAL_VENV/bin/activate \
  255. && pip install -r requirements.txt \
  256. && apt-get purge -y \
  257. build-essential gcc \
  258. && apt-get autoremove -y \
  259. && rm -rf /var/lib/apt/lists/*
  260. # Install ArchiveBox Python package from source
  261. COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
  262. RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
  263. echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
  264. && pip install -e "${CODE_DIR}[all]" \
  265. && rm -rf /var/lib/apt/lists/*
  266. ####################################################
  267. # Setup ArchiveBox runtime config
  268. WORKDIR "$DATA_DIR"
  269. RUN openssl rand -hex 16 > /etc/machine-id \
  270. && mkdir -p "/tmp/archivebox" \
  271. && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp/archivebox" \
  272. && mkdir -p "/usr/share/archivebox/lib" \
  273. && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/usr/share/archivebox/lib"
  274. ENV GOOGLE_API_KEY=no \
  275. GOOGLE_DEFAULT_CLIENT_ID=no \
  276. GOOGLE_DEFAULT_CLIENT_SECRET=no \
  277. TMP_DIR=/tmp/archivebox \
  278. LIB_DIR=/usr/share/archivebox/lib \
  279. ALLOWED_HOSTS=*
  280. # Print version for nice docker finish summary
  281. RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
  282. && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
  283. && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \
  284. ) | tee -a /VERSION.txt
  285. RUN "$CODE_DIR"/bin/docker_entrypoint.sh version 2>&1 | tee -a /VERSION.txt
  286. ####################################################
  287. # Open up the interfaces to the outside world
  288. WORKDIR "$DATA_DIR"
  289. VOLUME "$DATA_DIR"
  290. EXPOSE 8000
  291. HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
  292. CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
  293. ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
  294. CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]