Dockerfile 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  # This is the Dockerfile for ArchiveBox. It bundles the following dependencies:
  #     python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file
  # Usage:
  #     git submodule update --init --recursive
  #     git pull --recurse-submodules
  #     docker build . -t archivebox --no-cache
  #     docker run -v "$PWD/data":/data archivebox init
  #     docker run -v "$PWD/data":/data archivebox add 'https://example.com'
  #     docker run -v "$PWD/data":/data -it archivebox manage createsuperuser
  #     docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
  # Multi-arch build:
  #     docker buildx create --use
  #     docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
  #
  # Read more about [developing ArchiveBox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
  # Use Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/
  # Base image: CPython 3.11 on the slim Debian bookworm variant.
  FROM python:3.11-slim-bookworm

  # Descriptive metadata attached to the resulting image.
  LABEL name="archivebox" \
        maintainer="Nick Sweeting <[email protected]>" \
        description="All-in-one personal internet archiving container" \
        homepage="https://github.com/ArchiveBox/ArchiveBox" \
        documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"

  # Target-platform build args. Later RUN steps in this file read them for
  # per-architecture cache ids, the playwright/apt chromium branch, and the
  # build summary written to /VERSION.txt.
  ARG TARGETPLATFORM
  ARG TARGETOS
  ARG TARGETARCH
  ARG TARGETVARIANT
  ######### Environment Variables #################################

  # System-wide settings: timezone, locale, non-interactive apt, and
  # quieter/unbuffered python, pip and npm behaviour.
  ENV TZ=UTC \
      LANGUAGE=en_US:en \
      LC_ALL=C.UTF-8 \
      LANG=C.UTF-8 \
      DEBIAN_FRONTEND=noninteractive \
      APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
      PYTHONIOENCODING=UTF-8 \
      PYTHONUNBUFFERED=1 \
      PIP_DISABLE_PIP_VERSION_CHECK=1 \
      npm_config_loglevel=error

  # Language runtime versions referenced by the install steps below.
  ENV PYTHON_VERSION=3.11 \
      NODE_VERSION=20

  # Name and default uid/gid of the unprivileged runtime account.
  ENV ARCHIVEBOX_USER="archivebox" \
      DEFAULT_PUID=911 \
      DEFAULT_PGID=911

  # Image-wide filesystem locations.
  ENV CODE_DIR=/app \
      DATA_DIR=/data \
      GLOBAL_VENV=/venv \
      PLAYWRIGHT_BROWSERS_PATH=/browsers

  # Locations inside the application directory.
  ENV APP_VENV=/app/.venv \
      NODE_MODULES=/app/node_modules

  # Append the venv and node_modules bin directories to PATH. This needs its own
  # ENV instruction because it references variables set by the ones above.
  ENV PATH="${PATH}:${GLOBAL_VENV}/bin:${APP_VENV}/bin:${NODE_MODULES}/.bin"

  # Run every following RUN step under bash in strict mode, so failures inside
  # pipelines, unset variables, and failing commands abort the build.
  SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "nounset", "-c"]
  ######### System Environment ####################################

  # Bring in package.json early so the ArchiveBox version number can be read from it.
  COPY --chown=root:root --chmod=755 package.json "${CODE_DIR}/"
  # Take the 4th double-quote-delimited field of the '"version": ' line and
  # start /VERSION.txt with it.
  RUN grep '"version": ' "${CODE_DIR}/package.json" \
      | awk -F'"' '{print $4}' \
      > /VERSION.txt

  # Make apt keep downloaded packages in /var/cache/apt (massively speeds up Docker builds):
  # drop the base image's docker-clean config and write a keep-cache config instead.
  RUN rm -f /etc/apt/apt.conf.d/docker-clean \
      && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache

  # Print build debug info and append it to /VERSION.txt.
  # For human eyes only, not used by anything else.
  RUN ( \
          echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
          && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
          && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
          && echo \
          && echo "GLOBAL_VENV=${GLOBAL_VENV} APP_VENV=${APP_VENV} NODE_MODULES=${NODE_MODULES}" \
          && echo "PYTHON=${PYTHON_VERSION} NODE=${NODE_VERSION} PATH=${PATH}" \
          && echo "CODE_DIR=${CODE_DIR} DATA_DIR=${DATA_DIR}" \
          && echo \
          && uname -a \
          && head -n7 /etc/os-release \
          && which bash && bash --version | head -n1 \
          && which dpkg && dpkg --version | head -n1 \
          && echo -e '\n\n' && env && echo -e '\n\n' \
      ) | tee -a /VERSION.txt
  # Create the non-privileged account used for archivebox and chrome.
  # The system group and user are created first, then renumbered to the default
  # uid/gid; the resulting ids are appended to /VERSION.txt.
  RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
      && groupadd --system "$ARCHIVEBOX_USER" \
      && useradd --system --create-home --gid "$ARCHIVEBOX_USER" --groups audio,video "$ARCHIVEBOX_USER" \
      && usermod -u "$DEFAULT_PUID" "$ARCHIVEBOX_USER" \
      && groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \
      && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u $ARCHIVEBOX_USER) PGID=$(id -g $ARCHIVEBOX_USER)\n\n" \
          | tee -a /VERSION.txt
  # DEFAULT_PUID and DEFAULT_PGID are overridden by PUID and PGID in /bin/docker_entrypoint.sh at runtime
  # https://docs.linuxserver.io/general/understanding-puid-and-pgid
  # Install the base system packages from apt, after registering the
  # bookworm-backports repository so more recent package versions are available.
  # /var/cache/apt is a per-architecture BuildKit cache mount, so downloaded
  # .deb files are reused across builds without ending up in the image.
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
      echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
      && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
      && mkdir -p /etc/apt/keyrings \
      && apt-get update -qq \
      && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
          # 1. packaging dependencies
          apt-transport-https \
          ca-certificates \
          apt-utils \
          gnupg2 \
          curl \
          wget \
          # 2. docker and init system dependencies
          zlib1g-dev \
          dumb-init \
          gosu \
          cron \
          unzip \
          grep \
          # 3. frivolous CLI helpers to make debugging failed archiving easier (not installed):
          # nano iputils-ping dnsutils htop procps jq yq
      && rm -rf /var/lib/apt/lists/*
  ######### Language Environments ####################################

  # Install the Node.js runtime from the NodeSource apt repository.
  # Steps:
  #   1. register the NodeSource repo for NODE_VERSION, verified via a dedicated keyring
  #   2. download the NodeSource signing key and convert it from ASCII armor to
  #      a binary keyring with a single `gpg --dearmor` pass (a second pass over
  #      already-binary data is redundant)
  #   3. install nodejs plus its small runtime helpers
  #   4. upgrade npm itself, then append node/npm versions to /VERSION.txt
  # /etc/apt/keyrings is created by the base apt dependencies step earlier in this file.
  # The apt and npm caches are per-architecture BuildKit cache mounts.
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
      echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
      && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
      && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" \
          | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
      && apt-get update -qq \
      && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
          nodejs libatomic1 python3-minimal \
      && rm -rf /var/lib/apt/lists/* \
      # Update NPM to latest version
      && npm i -g npm --cache /root/.npm \
      # Save version info
      && ( \
          which node && node --version \
          && which npm && npm --version \
          && echo -e '\n\n' \
      ) | tee -a /VERSION.txt
  # Set up the Python environment.
  # The base image already ships Python, so this step only verifies and records it;
  # the virtualenv / PDM setup commands below are kept for reference but disabled.
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
      echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
      # (disabled) tell PDM to allow using global system python site packages
      # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
      # (disabled) create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
      # && python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
      # && python3 -m venv --system-site-packages $GLOBAL_VENV \
      # && python3 -m venv $GLOBAL_VENV \
      # (disabled) install global dependencies / python build dependencies in GLOBAL_VENV
      # && pip install --upgrade pip setuptools wheel \
      # Record interpreter and pip versions. The grep prints the python3 version
      # line only if it contains " $PYTHON_VERSION" and returns non-zero otherwise.
      && ( \
          which python3 && python3 --version | grep " $PYTHON_VERSION" \
          && which pip && pip --version \
          # && which pdm && pdm --version \
          && echo -e '\n\n' \
      ) | tee -a /VERSION.txt
  ######### Extractor Dependencies ##################################

  # Install the command-line extractor tools from apt (preferring backports),
  # then append each tool's path and version to /VERSION.txt.
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
      echo "[+] Installing APT extractor dependencies globally using apt..." \
      && apt-get update -qq \
      && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
          curl \
          wget \
          git \
          yt-dlp \
          ffmpeg \
          ripgrep \
          # Packages we have also needed in the past:
          # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
          # fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
      && rm -rf /var/lib/apt/lists/* \
      # Save version info
      && ( \
          which curl && curl --version | head -n1 \
          && which wget && wget --version 2>&1 | head -n1 \
          && which yt-dlp && yt-dlp --version 2>&1 | head -n1 \
          && which git && git --version 2>&1 | head -n1 \
          && which rg && rg --version 2>&1 | head -n1 \
          && echo -e '\n\n' \
      ) | tee -a /VERSION.txt
  # Install the Chromium browser.
  #   - amd64 / arm64: install playwright with pip and let it install chromium
  #     (plus OS deps). The playwright cache mount is first copied to
  #     PLAYWRIGHT_BROWSERS_PATH, before `playwright install` runs.
  #   - any other platform: install Debian's chromium package and fonts from apt.
  # Either way the resolved binary is symlinked to /usr/bin/chromium-browser, and
  # the runtime user is given ownership of its chromium config directory and of
  # PLAYWRIGHT_BROWSERS_PATH.
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
      echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
      && apt-get update -qq \
      && case "$TARGETPLATFORM" in \
          *amd64*|*arm64*) \
              # install Chromium using playwright
              pip install playwright \
              && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
              && playwright install --with-deps chromium \
              && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
              ;; \
          *) \
              # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
              apt-get install -qq -y -t bookworm-backports --no-install-recommends \
                  chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
              && export CHROME_BINARY="$(which chromium)" \
              ;; \
         esac \
      && rm -rf /var/lib/apt/lists/* \
      && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
      && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
      && chown -R "$ARCHIVEBOX_USER" "/home/${ARCHIVEBOX_USER}/.config" \
      && mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \
      && chown -R "$ARCHIVEBOX_USER" "$PLAYWRIGHT_BROWSERS_PATH" \
      # Save version info (falls back to the apt install path if the symlinked binary fails)
      && ( \
          which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
          && echo -e '\n\n' \
      ) | tee -a /VERSION.txt
  # Install the Node-based extractor dependencies.
  # Only the manifest and lockfile are copied here, before the full source tree
  # is copied later in this file.
  WORKDIR "$CODE_DIR"
  COPY --chown=root:root --chmod=755 package.json package-lock.json "${CODE_DIR}/"
  RUN --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
      echo "[+] Installing NPM extractor dependencies from package.json into $NODE_MODULES..." \
      # install exactly what package-lock.json specifies, reusing the mounted npm cache
      && npm ci --prefer-offline --no-audit --cache /root/.npm \
      # Save version info
      && ( \
          which node && node --version \
          && which npm && npm version \
          && echo -e '\n\n' \
      ) | tee -a /VERSION.txt
  ######### Build Dependencies ####################################

  # Install ArchiveBox's Python dependencies from requirements.txt.
  # build-essential is installed before `pip install` and purged again in this
  # same RUN step. The PDM-based flow is kept for reference but disabled.
  WORKDIR "$CODE_DIR"
  COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "${CODE_DIR}/"
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
      echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
      && apt-get update -qq \
      && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
          build-essential \
          libssl-dev \
          libldap2-dev \
          libsasl2-dev \
          python3-ldap \
          python3-msgpack \
          python3-mutagen \
          python3-regex \
          python3-pycryptodome \
          procps \
      # (disabled) PDM / virtualenv based install:
      # && ln -s "$GLOBAL_VENV" "$APP_VENV" \
      # && pdm use --venv in-project \
      # && pdm run python -m ensurepip \
      # && pdm sync --fail-fast --no-editable --group :all --no-self \
      # && pdm export -o requirements.txt --without-hashes \
      # && source $GLOBAL_VENV/bin/activate \
      && pip install -r requirements.txt \
      && apt-get purge -y build-essential \
      && apt-get autoremove -y \
      && rm -rf /var/lib/apt/lists/*
  # Install the ArchiveBox Python package itself from the copied source tree.
  # The package is installed in editable mode (-e) from CODE_DIR with the
  # "sonic" and "ldap" extras enabled.
  COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
  RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
      echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
      # (disabled) install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
      # && apt-get update -qq \
      # && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
      # build-essential \
      # The whole requirement spec is quoted: an unquoted [sonic,ldap] is a shell
      # glob bracket expression and would be expanded by bash if a matching path existed.
      && pip install -e "${CODE_DIR}[sonic,ldap]" \
      # (disabled) save docker image size and always remove compilers / build tools after building is complete
      # && apt-get purge -y build-essential \
      # && apt-get autoremove -y \
      && rm -rf /var/lib/apt/lists/*
  ####################################################

  # Runtime configuration: work out of the data directory and set the
  # IN_DOCKER flag in the container environment.
  WORKDIR "$DATA_DIR"
  ENV IN_DOCKER=True
      ## No need to set explicitly, these values will be autodetected by archivebox in docker:
      # CHROME_SANDBOX=False \
      # WGET_BINARY="wget" \
      # YOUTUBEDL_BINARY="yt-dlp" \
      # CHROME_BINARY="/usr/bin/chromium-browser" \
      # USE_SINGLEFILE=True \
      # SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
      # USE_READABILITY=True \
      # READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
      # USE_MERCURY=True \
      # MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"

  # Append the end-of-build summary (platform and end time) to /VERSION.txt
  # and print it for a nice docker finish summary.
  RUN ( \
          echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
          && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
          && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \
      ) | tee -a /VERSION.txt

  # Run the entrypoint script's `version` command and append its output
  # (stdout and stderr) to /VERSION.txt.
  RUN "${CODE_DIR}/bin/docker_entrypoint.sh" version 2>&1 \
      | tee -a /VERSION.txt
  ####################################################

  # Interfaces exposed to the outside world: the data directory volume and port 8000.
  WORKDIR "$DATA_DIR"
  VOLUME "$DATA_DIR"
  EXPOSE 8000

  # Optional health check (disabled):
  # HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
  #     CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1

  # dumb-init is the container's entrypoint process and launches the entrypoint
  # script; the default command starts the ArchiveBox server on 0.0.0.0:8000.
  ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
  CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]