Browse Source

merge fixes

Nick Sweeting 1 year ago
parent
commit
ac73fb5129
6 changed files with 40 additions and 41 deletions
  1. 7 6
      Dockerfile
  2. 1 1
      README.md
  3. 1 1
      archivebox/index/__init__.py
  4. 26 28
      docker-compose.yml
  5. 1 1
      package.json
  6. 4 4
      pyproject.toml

+ 7 - 6
Dockerfile

@@ -10,7 +10,7 @@
 #     docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
 # Multi-arch build:
 #     docker buildx create --use
-#     docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
+#     docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
 #
 # Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
 
@@ -194,10 +194,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
         && playwright install --with-deps chromium \
         && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
     else \
-        # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.) 
-        apt-get install -qq -y -t bookworm-backports --no-install-recommends \
-            chromium \
-        && export CHROME_BINARY="$(which chromium)"; \
+        # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
+        # apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+        #     chromium \
+        # && export CHROME_BINARY="$(which chromium)"; \
+        echo 'armv7 no longer supported in versions after v0.7.3' \
+        && exit 1; \
     fi \
     && rm -rf /var/lib/apt/lists/* \
     && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
@@ -275,7 +277,6 @@ ENV IN_DOCKER=True \
     GOOGLE_DEFAULT_CLIENT_SECRET=no \
     ALLOWED_HOSTS=*
     ## No need to set explicitly, these values will be autodetected by archivebox in docker:
-    # CHROME_SANDBOX=False \
     # WGET_BINARY="wget" \
     # YOUTUBEDL_BINARY="yt-dlp" \
     # CHROME_BINARY="/usr/bin/chromium-browser" \

+ 1 - 1
README.md

@@ -1076,7 +1076,7 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co
 <li><strong>Don't store large collections on older filesystems like EXT3/FAT</strong> as they may not be able to handle more than 50k directory entries in the <code>data/archive/</code> folder.
 </li>
 <li><strong>Try to keep the <code>data/index.sqlite3</code> file on local drive (not a network mount)</strong> or SSD for maximum performance, however the <code>data/archive/</code> folder can be on a network mount or slower HDD.</li>
-<li>If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid"><code>PUID</code> & <code>PGID</code></a> and <a href="https://github.com/ArchiveBox/ArchiveBox/issues/1304">disable <code>root_squash</code></a> on your fileshare server.
+<li>If using Docker or NFS/SMB/FUSE for the <code>data/archive/</code> folder, you may need to set <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid"><code>PUID</code> & <code>PGID</code></a> and <a href="https://github.com/ArchiveBox/ArchiveBox/issues/1304">disable <code>root_squash</code></a> on your fileshare server.
 </li>
 </ul>
 

+ 1 - 1
archivebox/index/__init__.py

@@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
     from core.models import Snapshot
     try:
-        return Snapshot.objects.all()
+        return Snapshot.objects.all().only('id')
 
     except (KeyboardInterrupt, SystemExit):
         raise SystemExit(0)

+ 26 - 28
docker-compose.yml

@@ -8,32 +8,26 @@
 # Documentation:
 #     https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
 
-version: '3.9'
 
 services:
     archivebox:
-        #image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
-        image: archivebox/archivebox:dev
-        command: server --quick-init 0.0.0.0:8000
+        image: archivebox/archivebox
         ports:
             - 8000:8000
         volumes:
             - ./data:/data
-            # - ./etc/crontabs:/var/spool/cron/crontabs  # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
-            # - ./archivebox:/app/archivebox             # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
-        # build: .                                       # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
         environment:
             - ALLOWED_HOSTS=*                   # restrict this to only accept incoming traffic via specific domain name
-            # - PUBLIC_INDEX=True               # set to False to prevent anonymous users from viewing snapshot list
-            # - PUBLIC_SNAPSHOTS=True           # set to False to prevent anonymous users from viewing snapshot content
-            # - PUBLIC_ADD_VIEW=False           # set to True to allow anonymous users to submit new URLs to archive
             # - ADMIN_USERNAME=admin            # create an admin user on first run with the given user/pass combo
             # - ADMIN_PASSWORD=SomeSecretPassword
             # - PUID=911                        # set to your host user's UID & GID if you encounter permissions issues
             # - PGID=911
-            # - SEARCH_BACKEND_ENGINE=sonic     # uncomment these and sonic container below for better full-text search
-            # - SEARCH_BACKEND_HOST_NAME=sonic
-            # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
+            # - PUBLIC_INDEX=True               # set to False to prevent anonymous users from viewing snapshot list
+            # - PUBLIC_SNAPSHOTS=True           # set to False to prevent anonymous users from viewing snapshot content
+            # - PUBLIC_ADD_VIEW=False           # set to True to allow anonymous users to submit new URLs to archive
+            - SEARCH_BACKEND_ENGINE=sonic     # comment out these and the sonic container below to disable Sonic full-text search
+            - SEARCH_BACKEND_HOST_NAME=sonic
+            - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
             # - MEDIA_MAX_SIZE=750m             # increase this filesize limit to allow archiving larger audio/video files
             # - TIMEOUT=60                      # increase this number to 120+ seconds if you see many slow downloads timing out
             # - CHECK_SSL_VALIDITY=True         # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
@@ -42,7 +36,7 @@ services:
             # add further configuration options from archivebox/config.py as needed (to apply them only to this container)
             # or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers)
         
-        # For ad-blocking during archiving, uncomment this section and pihole service section below 
+        # For ad-blocking during archiving, uncomment this section and pihole service section below
         # networks:
         #   - dns
         # dns:
@@ -51,22 +45,26 @@ services:
 
     ######## Optional Addons: tweak examples below as needed for your specific use case ########
 
-    ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
-    #   $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
-    # After starting, backfill any existing Snapshots into the full-text index:
+    ### Runs the Sonic full-text search backend; its config file is auto-downloaded to /etc/sonic.cfg when the image is built.
+    #   After starting, backfill any existing Snapshots into the full-text index:
     #   $ docker compose run archivebox update --index-only
 
-    # sonic:
-    #    image: valeriansaliou/sonic:latest
-    #    expose:
-    #        - 1491
-    #    environment:
-    #        - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
-    #    volumes:
-    #        - ./sonic.cfg:/etc/sonic.cfg:ro
-    #        - ./data/sonic:/var/lib/sonic/store
-    
-    
+    sonic:
+        image: valeriansaliou/sonic
+        build:
+            dockerfile_inline: |
+                FROM quay.io/curl/curl:latest AS setup
+                RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg
+                FROM valeriansaliou/sonic:latest
+                COPY --from=setup /tmp/sonic.cfg /etc/sonic.cfg
+        expose:
+            - 1491
+        environment:
+            - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
+        volumes:
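+            # - ./etc/sonic.cfg:/etc/sonic.cfg:ro   # optional: mount your own config over the one baked into the image (the host file must already exist)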
+            - ./data/sonic:/var/lib/sonic/store
+
     ### Example: To run pihole in order to block ad/tracker requests during archiving,
     # uncomment this block and set up pihole using its admin interface
 

+ 1 - 1
package.json

@@ -8,6 +8,6 @@
   "dependencies": {
     "@postlight/parser": "^2.2.3",
     "readability-extractor": "github:ArchiveBox/readability-extractor",
-    "single-file-cli": "^1.1.46"
+    "single-file-cli": "^1.1.54"
   }
 }

+ 4 - 4
pyproject.toml

@@ -15,15 +15,16 @@ dependencies = [
     "dateparser>=1.0.0",
     "django-extensions>=3.2.3",
     "django>=4.2.0,<5.0",
+    "setuptools>=69.0.3",
     "feedparser>=6.0.11",
     "ipython>5.0.0",
     "mypy-extensions>=0.4.3",
     "python-crontab>=2.5.1",
     "requests>=2.24.0",
     "w3lib>=1.22.0",
-    "yt-dlp>=2023.10.13",
+    "yt-dlp>=2024.3.10",
     #  don't add playwright because packages without sdists cause trouble on many build systems that refuse to install wheel-only packages
-    # "playwright>=1.39.0; platform_machine != 'armv7l'",
+    "playwright>=1.39.0; platform_machine != 'armv7l'",
 ]
 
 classifiers = [
@@ -64,11 +65,11 @@ classifiers = [
 sonic = [
     # echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
     # curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
+    # apt install sonic
     "sonic-client>=0.0.5",
 ]
 ldap = [
     # apt install libldap2-dev libsasl2-dev python3-ldap
-    "setuptools>=69.0.3",
     "python-ldap>=3.4.3",
     "django-auth-ldap>=4.1.0",
 ]
@@ -83,7 +84,6 @@ ldap = [
 [tool.pdm.dev-dependencies]
 dev = [
     # building
-    "setuptools>=69.0.3",
     "wheel",
     "pdm",
     "homebrew-pypi-poet>=0.10.0",