Jelajahi Sumber

Merge branch 'main' into dev

Nick Sweeting 1 tahun lalu
induk
melakukan
8dcfa93ec6

+ 12 - 0
.github/dependabot.yml

@@ -0,0 +1,12 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/"
+    target-branch: "dev"
+    schedule:
+      interval: "weekly"

+ 92 - 0
.github/workflows/codeql.yml

@@ -0,0 +1,92 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "dev" ]
+  pull_request:
+    branches: [ "dev" ]
+  schedule:
+    - cron: '33 17 * * 6'
+
+jobs:
+  analyze:
+    name: Analyze (${{ matrix.language }})
+    # Runner size impacts CodeQL analysis time. To learn more, please see:
+    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
+    #   - https://gh.io/supported-runners-and-hardware-resources
+    #   - https://gh.io/using-larger-runners (GitHub.com only)
+    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
+    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
+    permissions:
+      # required for all workflows
+      security-events: write
+
+      # required to fetch internal or private CodeQL packs
+      packages: read
+
+      # only required for workflows in private repositories
+      actions: read
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - language: python
+          build-mode: none
+        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
+        # Use `c-cpp` to analyze code written in C, C++ or both
+        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
+        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
+        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
+        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
+        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v3
+      with:
+        languages: ${{ matrix.language }}
+        build-mode: ${{ matrix.build-mode }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+        # queries: security-extended,security-and-quality
+
+    # If the analyze step fails for one of the languages you are analyzing with
+    # "We were unable to automatically build your code", modify the matrix above
+    # to set the build mode to "manual" for that language. Then modify this step
+    # to build your code.
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+    - if: matrix.build-mode == 'manual'
+      run: |
+        echo 'If you are using a "manual" build mode for one or more of the' \
+          'languages you are analyzing, replace this with the commands to build' \
+          'your code, for example:'
+        echo '  make bootstrap'
+        echo '  make release'
+        exit 1
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v3
+      with:
+        category: "/language:${{matrix.language}}"

+ 4 - 3
README.md

@@ -124,8 +124,8 @@ curl -fsSL 'https://get.archivebox.io' | sh
 
 ## Key Features
 
-- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), doesn't require signing up online, stores all data locally
-- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies) 
+- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), own your own data & maintain your privacy by self-hosting
+- [**Powerful command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular dependencies](#dependencies) and [support for Google Drive/NFS/SMB/S3/B2/etc.](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage)
 - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
 - [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats)
 - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats)
@@ -669,7 +669,7 @@ docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://exampl
 ```bash
 # archivebox add --help
 archivebox add 'https://example.com/some/page'
-archivebox add < ~/Downloads/firefox_bookmarks_export.html
+archivebox add --parser=generic_rss < ~/Downloads/some_feed.xml
 archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
 echo 'http://example.com' | archivebox add
 echo 'any text with <a href="https://example.com">urls</a> in it' | archivebox add
@@ -865,6 +865,7 @@ Each snapshot subfolder <code>data/archive/TIMESTAMP/</code> includes a static <
 
 <h4>Learn More</h4>
 <ul>
+<li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage">Wiki: Setting Up Storage (SMB, NFS, S3, B2, Google Drive, etc.)</a></li>
 <li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Disk-Layout">Wiki: Usage (Disk Layout)</a></li>
 <li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#large-archives">Wiki: Usage (Large Archives)</a></li>
 <li><a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#output-folder">Wiki: Security Overview (Output Folder)</a></li>

+ 0 - 0
archivebox/api/__init__.py


+ 5 - 0
archivebox/api/apps.py

@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class APIConfig(AppConfig):
+    name = 'api'

+ 184 - 0
archivebox/api/archive.py

@@ -0,0 +1,184 @@
+# archivebox_api.py
+from typing import List, Optional
+from enum import Enum
+from pydantic import BaseModel
+from ninja import Router
+from main import (
+    add,
+    remove,
+    update,
+    list_all,
+    ONLY_NEW,
+)  # Assuming these functions are defined in main.py
+
+
+# Schemas
+
+class StatusChoices(str, Enum):
+    indexed = 'indexed'
+    archived = 'archived'
+    unarchived = 'unarchived'
+    present = 'present'
+    valid = 'valid'
+    invalid = 'invalid'
+    duplicate = 'duplicate'
+    orphaned = 'orphaned'
+    corrupted = 'corrupted'
+    unrecognized = 'unrecognized'
+
+
+class AddURLSchema(BaseModel):
+    urls: List[str]
+    tag: str = ""
+    depth: int = 0
+    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update_all: bool = False
+    index_only: bool = False
+    overwrite: bool = False
+    init: bool = False
+    extractors: str = ""
+    parser: str = "auto"
+
+
+class RemoveURLSchema(BaseModel):
+    yes: bool = False
+    delete: bool = False
+    before: Optional[float] = None
+    after: Optional[float] = None
+    filter_type: str = "exact"
+    filter_patterns: Optional[List[str]] = None
+
+
+class UpdateSchema(BaseModel):
+    resume: Optional[float] = None
+    only_new: Optional[bool] = None
+    index_only: Optional[bool] = False
+    overwrite: Optional[bool] = False
+    before: Optional[float] = None
+    after: Optional[float] = None
+    status: Optional[StatusChoices] = None
+    filter_type: Optional[str] = 'exact'
+    filter_patterns: Optional[List[str]] = None
+    extractors: Optional[str] = ""
+
+
+class ListAllSchema(BaseModel):
+    filter_patterns: Optional[List[str]] = None
+    filter_type: str = 'exact'
+    status: Optional[StatusChoices] = None
+    after: Optional[float] = None
+    before: Optional[float] = None
+    sort: Optional[str] = None
+    csv: Optional[str] = None
+    json: bool = False
+    html: bool = False
+    with_headers: bool = False
+
+
+# API Router
+router = Router()
+
+
+@router.post("/add", response={200: dict})
+def api_add(request, payload: AddURLSchema):
+    try:
+        result = add(
+            urls=payload.urls,
+            tag=payload.tag,
+            depth=payload.depth,
+            update=payload.update,
+            update_all=payload.update_all,
+            index_only=payload.index_only,
+            overwrite=payload.overwrite,
+            init=payload.init,
+            extractors=payload.extractors,
+            parser=payload.parser,
+        )
+        # Currently the add function returns a list of ALL items in the DB, ideally only return new items
+        return {
+            "status": "success",
+            "message": "URLs added successfully.",
+            "result": str(result),
+        }
+    except Exception as e:
+        # Handle exceptions raised by the add function or during processing
+        return {"status": "error", "message": str(e)}
+
+
+@router.post("/remove", response={200: dict})
+def api_remove(request, payload: RemoveURLSchema):
+    try:
+        result = remove(
+            yes=payload.yes,
+            delete=payload.delete,
+            before=payload.before,
+            after=payload.after,
+            filter_type=payload.filter_type,
+            filter_patterns=payload.filter_patterns,
+        )
+        return {
+            "status": "success",
+            "message": "URLs removed successfully.",
+            "result": result,
+        }
+    except Exception as e:
+        # Handle exceptions raised by the remove function or during processing
+        return {"status": "error", "message": str(e)}
+
+
+@router.post("/update", response={200: dict})
+def api_update(request, payload: UpdateSchema):
+    try:
+        result = update(
+            resume=payload.resume,
+            only_new=payload.only_new,
+            index_only=payload.index_only,
+            overwrite=payload.overwrite,
+            before=payload.before,
+            after=payload.after,
+            status=payload.status,
+            filter_type=payload.filter_type,
+            filter_patterns=payload.filter_patterns,
+            extractors=payload.extractors,
+        )
+        return {
+            "status": "success",
+            "message": "Archive updated successfully.",
+            "result": result,
+        }
+    except Exception as e:
+        # Handle exceptions raised by the update function or during processing
+        return {"status": "error", "message": str(e)}
+
+
+@router.post("/list_all", response={200: dict})
+def api_list_all(request, payload: ListAllSchema):
+    try:
+        result = list_all(
+            filter_patterns=payload.filter_patterns,
+            filter_type=payload.filter_type,
+            status=payload.status,
+            after=payload.after,
+            before=payload.before,
+            sort=payload.sort,
+            csv=payload.csv,
+            json=payload.json,
+            html=payload.html,
+            with_headers=payload.with_headers,
+        )
+        # TODO: This is kind of bad, make the format a choice field
+        if payload.json:
+            return {"status": "success", "format": "json", "data": result}
+        elif payload.html:
+            return {"status": "success", "format": "html", "data": result}
+        elif payload.csv:
+            return {"status": "success", "format": "csv", "data": result}
+        else:
+            return {
+                "status": "success",
+                "message": "List generated successfully.",
+                "data": result,
+            }
+    except Exception as e:
+        # Handle exceptions raised by the list_all function or during processing
+        return {"status": "error", "message": str(e)}

+ 48 - 0
archivebox/api/auth.py

@@ -0,0 +1,48 @@
+from django.contrib.auth import authenticate
+from ninja import Form, Router, Schema
+from ninja.security import HttpBearer
+
+from api.models import Token
+
+router = Router()
+
+
+class GlobalAuth(HttpBearer):
+    def authenticate(self, request, token):
+        try:
+            return Token.objects.get(token=token).user
+        except Token.DoesNotExist:
+            pass
+
+
+class AuthSchema(Schema):
+    email: str
+    password: str
+
+
+@router.post("/authenticate", auth=None)  # overriding global auth
+def get_token(request, auth_data: AuthSchema):
+    user = authenticate(username=auth_data.email, password=auth_data.password)
+    if user:
+        # Assuming a user can have multiple tokens and you want to create a new one every time
+        new_token = Token.objects.create(user=user)
+        return {"token": new_token.token, "expires": new_token.expiry_as_iso8601}
+    else:
+        return {"error": "Invalid credentials"}
+
+
+class TokenValidationSchema(Schema):
+    token: str
+
+
+@router.post("/validate_token", auth=None) # No authentication required for this endpoint
+def validate_token(request, token_data: TokenValidationSchema):
+    try:
+        # Attempt to authenticate using the provided token
+        user = GlobalAuth().authenticate(request, token_data.token)
+        if user:
+            return {"status": "valid"}
+        else:
+            return {"status": "invalid"}
+    except Token.DoesNotExist:
+        return {"status": "invalid"}

+ 28 - 0
archivebox/api/migrations/0001_initial.py

@@ -0,0 +1,28 @@
+# Generated by Django 3.1.14 on 2024-04-09 18:52
+
+import api.models
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Token',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('token', models.CharField(default=api.models.hex_uuid, max_length=32, unique=True)),
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('expiry', models.DateTimeField(blank=True, null=True)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
+            ],
+        ),
+    ]

+ 0 - 0
archivebox/api/migrations/__init__.py


+ 30 - 0
archivebox/api/models.py

@@ -0,0 +1,30 @@
+import uuid
+from datetime import timedelta
+
+from django.conf import settings
+from django.db import models
+from django.utils import timezone
+from django.utils.translation import gettext_lazy as _
+
+def hex_uuid():
+    return uuid.uuid4().hex
+
+
+class Token(models.Model):
+    user = models.ForeignKey(
+        settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tokens"
+    )
+    token = models.CharField(max_length=32, default=hex_uuid, unique=True)
+    created = models.DateTimeField(auto_now_add=True)
+    expiry = models.DateTimeField(null=True, blank=True)
+
+    @property
+    def expiry_as_iso8601(self):
+        """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
+        expiry_date = (
+            self.expiry if self.expiry else timezone.now() + timedelta(days=365 * 100)
+        )
+        return expiry_date.isoformat()
+
+    def __str__(self):
+        return self.token

+ 27 - 0
archivebox/api/tests.py

@@ -0,0 +1,27 @@
+from django.test import TestCase
+from ninja.testing import TestClient
+from archivebox.api.archive import router as archive_router
+
+class ArchiveBoxAPITestCase(TestCase):
+    def setUp(self):
+        self.client = TestClient(archive_router)
+
+    def test_add_endpoint(self):
+        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "test"})
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.json()["status"], "success")
+
+    def test_remove_endpoint(self):
+        response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.json()["status"], "success")
+
+    def test_update_endpoint(self):
+        response = self.client.post("/update", json={})
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.json()["status"], "success")
+
+    def test_list_all_endpoint(self):
+        response = self.client.post("/list_all", json={})
+        self.assertEqual(response.status_code, 200)
+        self.assertTrue("success" in response.json()["status"])

+ 1 - 0
archivebox/core/settings.py

@@ -61,6 +61,7 @@ INSTALLED_APPS = [
     'django.contrib.admin',
 
     'core',
+    'api',
 
     'django_extensions',
 ]

+ 14 - 0
archivebox/core/urls.py

@@ -8,6 +8,18 @@ from django.views.generic.base import RedirectView
 
 from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
 
+from ninja import NinjaAPI
+from api.auth import GlobalAuth
+
+api = NinjaAPI(auth=GlobalAuth())
+api.add_router("/auth/", "api.auth.router")
+api.add_router("/archive/", "api.archive.router")
+
+# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
+# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
+
+
 # print('DEBUG', settings.DEBUG)
 
 urlpatterns = [
@@ -31,6 +43,8 @@ urlpatterns = [
     path('accounts/', include('django.contrib.auth.urls')),
     path('admin/', archivebox_admin.urls),
     
+    path("api/", api.urls),
+
     path('health/', HealthCheckView.as_view(), name='healthcheck'),
     path('error/', lambda _: 1/0),
 

+ 0 - 0
archivebox/index.sqlite3


+ 37 - 1
docker-compose.yml

@@ -135,9 +135,45 @@ services:
     #     - ./etc/dnsmasq:/etc/dnsmasq.d
 
 
+    ### Example: Enable ability to run regularly scheduled archiving tasks by uncommenting this container
+    #   $ docker compose run archivebox schedule --every=day --depth=1 'https://example.com/some/rss/feed.xml'
+    # then restart the scheduler container to apply the changes to the schedule
+    #   $ docker compose restart archivebox_scheduler
+
+    # archivebox_scheduler:
+    #    image: archivebox/archivebox:latest
+    #    command: schedule --foreground
+    #    environment:
+    #        - MEDIA_MAX_SIZE=750m               # increase this number to allow archiving larger audio/video files
+    #        # - TIMEOUT=60                      # increase if you see timeouts often during archiving / on slow networks
+    #        # - ONLY_NEW=True                   # set to False to retry previously failed URLs when re-adding instead of skipping them
+    #        # - CHECK_SSL_VALIDITY=True         # set to False to allow saving URLs w/ broken SSL certs
+    #        # - SAVE_ARCHIVE_DOT_ORG=True       # set to False to disable submitting URLs to Archive.org when archiving
+    #        # - PUID=502                        # set to your host user's UID & GID if you encounter permissions issues
+    #        # - PGID=20
+    #    volumes:
+    #        - ./data:/data
+    #        - ./etc/crontabs:/var/spool/cron/crontabs
+    #    # cpus: 2                               # uncomment / edit these values to limit container resource consumption
+    #    # mem_limit: 2048m
+    #    # shm_size: 1024m
+
+
+    ### Example: Put Nginx in front of the ArchiveBox server for SSL termination
+
+    # nginx:
+    #     image: nginx:alpine
+    #     ports:
+    #         - 443:443
+    #         - 80:80
+    #     volumes:
+    #         - ./etc/nginx.conf:/etc/nginx/nginx.conf
+    #         - ./data:/var/www
+
+
     ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks.
     # You can also use any other VPN that works at the docker IP level, e.g. Tailscale, OpenVPN, etc.
-
+    
     # wireguard:
     #   image: linuxserver/wireguard:latest
     #   network_mode: 'service:archivebox'

+ 10 - 0
pdm.lock

@@ -7,6 +7,16 @@ strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
 content_hash = "sha256:a2483b801ba2cb7748849f80e9030d949728ea3686eb023dc333b5a99f610874"
 
+[[package]]
+name = "annotated-types"
+version = "0.6.0"
+requires_python = ">=3.8"
+summary = "Reusable constraint types to use with typing.Annotated"
+files = [
+    {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"},
+    {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"},
+]
+
 [[package]]
 name = "asgiref"
 version = "3.8.1"

+ 1 - 0
pyproject.toml

@@ -15,6 +15,7 @@ dependencies = [
     # Base Framework and Language Dependencies
     "setuptools>=69.5.1",
     "django>=4.2.0,<5.0",
+    "django-ninja>=1.1.0",
     "django-extensions>=3.2.3",
     "mypy-extensions>=1.0.0",
 

+ 6 - 0
requirements.txt

@@ -3,6 +3,8 @@
 
 asgiref==3.8.1
 asttokens==2.4.1
+babel==2.14.0
+blinker==1.7.0
 brotli==1.1.0; implementation_name == "cpython"
 brotlicffi==1.1.0.0; implementation_name != "cpython"
 certifi==2024.2.2
@@ -24,6 +26,7 @@ ipython==8.23.0
 jedi==0.19.1
 matplotlib-inline==0.1.7
 mutagen==1.47.0
+mypy==1.8.0
 mypy-extensions==1.0.0
 parso==0.8.4
 pexpect==4.9.0; sys_platform != "win32" and sys_platform != "emscripten"
@@ -37,6 +40,8 @@ pycparser==2.22; implementation_name != "cpython"
 pycryptodomex==3.20.0
 pyee==11.1.0; platform_machine != "armv7l"
 pygments==2.17.2
+pyproject-hooks==1.0.0
+pytest==7.4.4
 python-crontab==3.0.0
 python-dateutil==2.9.0.post0
 python-ldap==3.4.4
@@ -46,6 +51,7 @@ requests==2.31.0
 setuptools==69.5.1
 sgmllib3k==1.0.0
 six==1.16.0
+snowballstemmer==2.2.0
 sonic-client==1.0.0
 sqlparse==0.5.0
 stack-data==0.6.3