2 months ago · 28e6c5bb65
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -3,7 +3,10 @@
 
				     "allow": [
			
 
				       "Bash(python -m archivebox:*)",
			
 
				       "Bash(ls:*)",
			
 
				-      "Bash(xargs:*)"
			
 
				+      "Bash(xargs:*)",
			
 
				+      "Bash(python -c:*)",
			
 
				+      "Bash(printf:*)",
			
 
				+      "Bash(pkill:*)"
			
 
				     ]
			
 
				   }
			
 
				 }
			
--- a/archivebox/cli/archivebox_mcp.py
+++ b/archivebox/cli/archivebox_mcp.py
@@ -0,0 +1,49 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+archivebox mcp
			
 
				+
			
 
				+Start the Model Context Protocol (MCP) server in stdio mode.
			
 
				+Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
			
 
				+"""
			
 
				+
			
 
				+__package__ = 'archivebox.cli'
			
 
				+__command__ = 'archivebox mcp'
			
 
				+
			
 
				+import rich_click as click
			
 
				+
			
 
				+from archivebox.misc.util import docstring, enforce_types
			
 
				+
			
 
				+
			
 
				+@enforce_types
			
 
				+def mcp():
			
 
				+    """
			
 
				+    Start the MCP server in stdio mode for AI agent control.
			
 
				+
			
 
				+    The MCP (Model Context Protocol) server exposes all ArchiveBox CLI commands
			
 
				+    as tools that AI agents can discover and execute. It communicates via JSON-RPC
			
 
				+    2.0 over stdin/stdout.
			
 
				+
			
 
				+    Example usage with an MCP client:
			
 
				+        archivebox mcp < requests.jsonl > responses.jsonl
			
 
				+
			
 
				+    Or interactively:
			
 
				+        archivebox mcp
			
 
				+        {"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}
			
 
				+        {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
			
 
				+    """
			
 
				+
			
 
				+    from mcp.server import run_mcp_server
			
 
				+
			
 
				+    # Run the stdio server (blocks until stdin closes)
			
 
				+    run_mcp_server()
			
 
				+
			
 
				+
			
 
				[email protected]()
			
 
				+@docstring(mcp.__doc__)
			
 
				+def main(**kwargs):
			
 
				+    """Start the MCP server in stdio mode"""
			
 
				+    mcp()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()
			
--- a/archivebox/mcp/README.md
+++ b/archivebox/mcp/README.md
@@ -0,0 +1,138 @@
 
				+# ArchiveBox MCP Server
			
 
				+
			
 
				+Model Context Protocol (MCP) server for ArchiveBox that exposes all CLI commands as tools for AI agents.
			
 
				+
			
 
				+## Overview
			
 
				+
			
 
				+This is a lightweight, stateless MCP server that dynamically introspects ArchiveBox's Click CLI commands and exposes them as MCP tools. It requires **zero manual schema definitions** - everything is auto-generated from the existing CLI metadata.
			
 
				+
			
 
				+## Features
			
 
				+
			
 
				+- ✅ **Auto-discovery**: Dynamically discovers all 19+ ArchiveBox CLI commands
			
 
				+- ✅ **Zero duplication**: Reuses existing Click command definitions, types, and help text
			
 
				+- ✅ **Auto-sync**: Changes to CLI commands automatically reflected in MCP tools
			
 
				+- ✅ **Stateless**: No database models or state management required
			
 
				+- ✅ **Lightweight**: ~350 lines of code
			
 
				+
			
 
				+## Usage
			
 
				+
			
 
				+### Start the MCP Server
			
 
				+
			
 
				+```bash
			
 
				+archivebox mcp
			
 
				+```
			
 
				+
			
 
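				+The server runs in stdio mode, reading JSON-RPC 2.0 requests from stdin and writing responses to stdout. Messages are newline-delimited: each request must be a single line of JSON, and each response is written as a single line.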
			
 
				+
			
 
				+### Example Client
			
 
				+
			
 
				+```python
			
 
				+import subprocess
			
 
				+import json
			
 
				+
			
 
				+# Start MCP server
			
 
				+proc = subprocess.Popen(
			
 
				+    ['archivebox', 'mcp'],
			
 
				+    stdin=subprocess.PIPE,
			
 
				+    stdout=subprocess.PIPE,
			
 
				+    text=True
			
 
				+)
			
 
				+
			
 
				+# Send initialize request
			
 
				+request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}}
			
 
				+proc.stdin.write(json.dumps(request) + '\n')
			
 
				+proc.stdin.flush()
			
 
				+
			
 
				+# Read response
			
 
				+response = json.loads(proc.stdout.readline())
			
 
				+print(response)
			
 
				+```
			
 
				+
			
 
				+### Example Requests
			
 
				+
			
 
				+**Initialize:**
			
 
				+```json
			
 
				+{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}
			
 
				+```
			
 
				+
			
 
				+**List all available tools:**
			
 
				+```json
			
 
				+{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
			
 
				+```
			
 
				+
			
 
				+**Call a tool** (pretty-printed here for readability; send it to the server as a single line):
			
 
				+```json
			
 
				+{
			
 
				+  "jsonrpc":"2.0",
			
 
				+  "id":3,
			
 
				+  "method":"tools/call",
			
 
				+  "params":{
			
 
				+    "name":"version",
			
 
				+    "arguments":{"quiet":true}
			
 
				+  }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+## Supported MCP Methods
			
 
				+
			
 
				+- `initialize` - Handshake and capability negotiation
			
 
				+- `tools/list` - List all available CLI commands as MCP tools
			
 
				+- `tools/call` - Execute a CLI command with arguments
			
 
				+
			
 
				+## Available Tools
			
 
				+
			
 
				+The server exposes all ArchiveBox CLI commands:
			
 
				+
			
 
				+**Meta**: `help`, `version`, `mcp`
			
 
				+**Setup**: `init`, `install`
			
 
				+**Archive**: `add`, `remove`, `update`, `search`, `status`, `config`
			
 
				+**Workers**: `orchestrator`, `worker`
			
 
				+**Tasks**: `crawl`, `snapshot`, `extract`
			
 
				+**Server**: `server`, `schedule`
			
 
				+**Utilities**: `shell`, `manage`
			
 
				+
			
 
				+## Architecture
			
 
				+
			
 
				+### Dynamic Introspection
			
 
				+
			
 
				+Instead of manually defining schemas, the server uses Click's introspection API to automatically generate MCP tool definitions:
			
 
				+
			
 
				+```python
			
 
				+# Auto-discover commands
			
 
				+from archivebox.cli import ArchiveBoxGroup
			
 
				+cli_group = ArchiveBoxGroup()
			
 
				+all_commands = cli_group.all_subcommands
			
 
				+
			
 
				+# Auto-generate schemas from Click metadata
			
 
				+for cmd_name in all_commands:
			
 
				+    click_cmd = cli_group.get_command(None, cmd_name)
			
 
				+    # Extract params, types, help text, etc.
			
 
				+    tool_schema = click_command_to_mcp_tool(cmd_name, click_cmd)
			
 
				+```
			
 
				+
			
 
				+### Tool Execution
			
 
				+
			
 
				+Commands are executed using Click's `CliRunner`:
			
 
				+
			
 
				+```python
			
 
				+from click.testing import CliRunner
			
 
				+
			
 
				+runner = CliRunner()
			
 
				+result = runner.invoke(click_command, args)
			
 
				+```
			
 
				+
			
 
				+## Files
			
 
				+
			
 
				+- `server.py` (~350 lines) - Core MCP server with Click introspection
			
 
				+- `archivebox/cli/archivebox_mcp.py` (~50 lines) - CLI entry point
			
 
				+- `apps.py`, `__init__.py` - Django app boilerplate
			
 
				+
			
 
				+## MCP Specification
			
 
				+
			
 
				+Implements the [MCP 2025-11-25 specification](https://modelcontextprotocol.io/specification/2025-11-25).
			
 
				+
			
 
				+## Sources
			
 
				+
			
 
				+- [MCP Specification](https://modelcontextprotocol.io/specification/2025-11-25)
			
 
				+- [MCP Introduction](https://www.anthropic.com/news/model-context-protocol)
			
 
				+- [MCP GitHub](https://github.com/modelcontextprotocol/modelcontextprotocol)
			
--- a/archivebox/mcp/__init__.py
+++ b/archivebox/mcp/__init__.py
@@ -0,0 +1,8 @@
 
				+__package__ = 'archivebox.mcp'
			
 
				+
			
 
				+"""
			
 
				+Model Context Protocol (MCP) server for ArchiveBox.
			
 
				+
			
 
				+Exposes all ArchiveBox CLI commands as MCP tools via dynamic Click introspection.
			
 
				+Provides a JSON-RPC 2.0 interface over stdio for AI agents to control ArchiveBox.
			
 
				+"""
			
--- a/archivebox/mcp/apps.py
+++ b/archivebox/mcp/apps.py
@@ -0,0 +1,9 @@
 
				+__package__ = 'archivebox.mcp'
			
 
				+
			
 
				+from django.apps import AppConfig
			
 
				+
			
 
				+
			
 
				+class MCPConfig(AppConfig):
			
 
				+    name = 'mcp'
			
 
				+    verbose_name = 'Model Context Protocol Server'
			
 
				+    default_auto_field = 'django.db.models.BigAutoField'
			
--- a/archivebox/mcp/server.py
+++ b/archivebox/mcp/server.py
@@ -0,0 +1,353 @@
 
				+__package__ = 'archivebox.mcp'
			
 
				+
			
 
				+"""
			
 
				+Model Context Protocol (MCP) server implementation for ArchiveBox.
			
 
				+
			
 
				+Dynamically exposes all ArchiveBox CLI commands as MCP tools by introspecting
			
 
				+Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+import json
			
 
				+import traceback
			
 
				+from typing import Any, Dict, List, Optional
			
 
				+from io import StringIO
			
 
				+from contextlib import redirect_stdout, redirect_stderr
			
 
				+
			
 
				+import click
			
 
				+from click.testing import CliRunner
			
 
				+
			
 
				+from archivebox.config.version import VERSION
			
 
				+
			
 
				+
			
 
				+class MCPJSONEncoder(json.JSONEncoder):
			
 
				+    """Custom JSON encoder that handles Click sentinel values and other special types"""
			
 
				+
			
 
				+    def default(self, obj):
			
 
				+        # Handle Click's sentinel values
			
 
				+        if hasattr(click, 'core') and hasattr(click.core, '_SentinelClass'):
			
 
				+            if isinstance(obj, click.core._SentinelClass):
			
 
				+                return None
			
 
				+
			
 
				+        # Handle tuples (convert to lists)
			
 
				+        if isinstance(obj, tuple):
			
 
				+            return list(obj)
			
 
				+
			
 
				+        # Handle any other non-serializable objects
			
 
				+        try:
			
 
				+            return super().default(obj)
			
 
				+        except TypeError:
			
 
				+            return str(obj)
			
 
				+
			
 
				+
			
 
				+# Type mapping from Click types to JSON Schema types
			
 
				+def click_type_to_json_schema_type(click_type) -> dict:
			
 
				+    """Convert a Click parameter type to JSON Schema type definition"""
			
 
				+
			
 
				+    if isinstance(click_type, click.types.StringParamType):
			
 
				+        return {"type": "string"}
			
 
				+    elif isinstance(click_type, click.types.IntParamType):
			
 
				+        return {"type": "integer"}
			
 
				+    elif isinstance(click_type, click.types.FloatParamType):
			
 
				+        return {"type": "number"}
			
 
				+    elif isinstance(click_type, click.types.BoolParamType):
			
 
				+        return {"type": "boolean"}
			
 
				+    elif isinstance(click_type, click.types.Choice):
			
 
				+        return {"type": "string", "enum": click_type.choices}
			
 
				+    elif isinstance(click_type, click.types.Path):
			
 
				+        return {"type": "string", "description": "File or directory path"}
			
 
				+    elif isinstance(click_type, click.types.File):
			
 
				+        return {"type": "string", "description": "File path"}
			
 
				+    elif isinstance(click_type, click.types.Tuple):
			
 
				+        # Multiple arguments of same type
			
 
				+        return {"type": "array", "items": {"type": "string"}}
			
 
				+    else:
			
 
				+        # Default to string for unknown types
			
 
				+        return {"type": "string"}
			
 
				+
			
 
				+
			
 
				+def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> dict:
			
 
				+    """
			
 
				+    Convert a Click command to an MCP tool definition with JSON Schema.
			
 
				+
			
 
				+    Introspects the Click command's parameters to automatically generate
			
 
				+    the input schema without manual definition.
			
 
				+    """
			
 
				+
			
 
				+    properties = {}
			
 
				+    required = []
			
 
				+
			
 
				+    # Extract parameters from Click command
			
 
				+    for param in click_command.params:
			
 
				+        # Skip internal parameters
			
 
				+        if param.name in ('help', 'version'):
			
 
				+            continue
			
 
				+
			
 
				+        param_schema = click_type_to_json_schema_type(param.type)
			
 
				+
			
 
				+        # Add description from Click help text
			
 
				+        if param.help:
			
 
				+            param_schema["description"] = param.help
			
 
				+
			
 
				+        # Handle default values
			
 
				+        if param.default is not None and param.default != ():
			
 
				+            param_schema["default"] = param.default
			
 
				+
			
 
				+        # Handle multiple values (like multiple URLs)
			
 
				+        if param.multiple:
			
 
				+            properties[param.name] = {
			
 
				+                "type": "array",
			
 
				+                "items": param_schema,
			
 
				+                "description": param_schema.get("description", f"Multiple {param.name} values")
			
 
				+            }
			
 
				+        else:
			
 
				+            properties[param.name] = param_schema
			
 
				+
			
 
				+        # Mark as required if Click requires it
			
 
				+        if param.required:
			
 
				+            required.append(param.name)
			
 
				+
			
 
				+    return {
			
 
				+        "name": cmd_name,
			
 
				+        "description": click_command.help or click_command.short_help or f"Run archivebox {cmd_name} command",
			
 
				+        "inputSchema": {
			
 
				+            "type": "object",
			
 
				+            "properties": properties,
			
 
				+            "required": required
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def execute_click_command(cmd_name: str, click_command: click.Command, arguments: dict) -> dict:
			
 
				+    """
			
 
				+    Execute a Click command programmatically with given arguments.
			
 
				+
			
 
				+    Returns MCP-formatted result with captured output and error status.
			
 
				+    """
			
 
				+
			
 
				+    # Use Click's test runner to invoke command programmatically
			
 
				+    runner = CliRunner()
			
 
				+
			
 
				+    # Convert arguments dict to CLI args list
			
 
				+    args = []
			
 
				+    for key, value in arguments.items():
			
 
				+        param_name = key.replace('_', '-')  # Click uses dashes
			
 
				+
			
 
				+        if isinstance(value, bool):
			
 
				+            if value:
			
 
				+                args.append(f'--{param_name}')
			
 
				+        elif isinstance(value, list):
			
 
				+            # Multiple values (e.g., multiple URLs)
			
 
				+            for item in value:
			
 
				+                args.append(str(item))
			
 
				+        elif value is not None:
			
 
				+            args.append(f'--{param_name}')
			
 
				+            args.append(str(value))
			
 
				+
			
 
				+    # Execute the command
			
 
				+    try:
			
 
				+        result = runner.invoke(click_command, args, catch_exceptions=False)
			
 
				+
			
 
				+        # Format output as MCP content
			
 
				+        content = []
			
 
				+
			
 
				+        if result.output:
			
 
				+            content.append({
			
 
				+                "type": "text",
			
 
				+                "text": result.output
			
 
				+            })
			
 
				+
			
 
				+        if result.stderr_bytes:
			
 
				+            stderr_text = result.stderr_bytes.decode('utf-8', errors='replace')
			
 
				+            if stderr_text.strip():
			
 
				+                content.append({
			
 
				+                    "type": "text",
			
 
				+                    "text": f"[stderr]\n{stderr_text}"
			
 
				+                })
			
 
				+
			
 
				+        # Check exit code
			
 
				+        is_error = result.exit_code != 0
			
 
				+
			
 
				+        if is_error and not content:
			
 
				+            content.append({
			
 
				+                "type": "text",
			
 
				+                "text": f"Command failed with exit code {result.exit_code}"
			
 
				+            })
			
 
				+
			
 
				+        return {
			
 
				+            "content": content or [{"type": "text", "text": "(no output)"}],
			
 
				+            "isError": is_error
			
 
				+        }
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        # Capture any exceptions during execution
			
 
				+        error_trace = traceback.format_exc()
			
 
				+        return {
			
 
				+            "content": [{
			
 
				+                "type": "text",
			
 
				+                "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}"
			
 
				+            }],
			
 
				+            "isError": True
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+class MCPServer:
			
 
				+    """
			
 
				+    Model Context Protocol server for ArchiveBox.
			
 
				+
			
 
				+    Provides JSON-RPC 2.0 interface over stdio, dynamically exposing
			
 
				+    all Click commands as MCP tools.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        # Import here to avoid circular imports
			
 
				+        from archivebox.cli import ArchiveBoxGroup
			
 
				+
			
 
				+        self.cli_group = ArchiveBoxGroup()
			
 
				+        self.protocol_version = "2025-11-25"
			
 
				+        self._tool_cache = {}  # Cache loaded Click commands
			
 
				+
			
 
				+    def get_click_command(self, cmd_name: str) -> Optional[click.Command]:
			
 
				+        """Get a Click command by name, with caching"""
			
 
				+        if cmd_name not in self._tool_cache:
			
 
				+            if cmd_name not in self.cli_group.all_subcommands:
			
 
				+                return None
			
 
				+            self._tool_cache[cmd_name] = self.cli_group.get_command(None, cmd_name)
			
 
				+        return self._tool_cache[cmd_name]
			
 
				+
			
 
				+    def handle_initialize(self, params: dict) -> dict:
			
 
				+        """Handle MCP initialize request"""
			
 
				+        return {
			
 
				+            "protocolVersion": self.protocol_version,
			
 
				+            "capabilities": {
			
 
				+                "tools": {}
			
 
				+            },
			
 
				+            "serverInfo": {
			
 
				+                "name": "archivebox-mcp",
			
 
				+                "version": VERSION
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+    def handle_tools_list(self, params: dict) -> dict:
			
 
				+        """Handle MCP tools/list request - returns all available CLI commands as tools"""
			
 
				+        tools = []
			
 
				+
			
 
				+        for cmd_name in self.cli_group.all_subcommands.keys():
			
 
				+            click_cmd = self.get_click_command(cmd_name)
			
 
				+            if click_cmd:
			
 
				+                try:
			
 
				+                    tool_def = click_command_to_mcp_tool(cmd_name, click_cmd)
			
 
				+                    tools.append(tool_def)
			
 
				+                except Exception as e:
			
 
				+                    # Log but don't fail - skip problematic commands
			
 
				+                    print(f"Warning: Could not generate tool for {cmd_name}: {e}", file=sys.stderr)
			
 
				+
			
 
				+        return {"tools": tools}
			
 
				+
			
 
				+    def handle_tools_call(self, params: dict) -> dict:
			
 
				+        """Handle MCP tools/call request - executes a CLI command"""
			
 
				+        tool_name = params.get('name')
			
 
				+        arguments = params.get('arguments', {})
			
 
				+
			
 
				+        if not tool_name:
			
 
				+            raise ValueError("Missing required parameter: name")
			
 
				+
			
 
				+        click_cmd = self.get_click_command(tool_name)
			
 
				+        if not click_cmd:
			
 
				+            raise ValueError(f"Unknown tool: {tool_name}")
			
 
				+
			
 
				+        # Execute the command and return MCP-formatted result
			
 
				+        return execute_click_command(tool_name, click_cmd, arguments)
			
 
				+
			
 
				+    def handle_request(self, request: dict) -> dict:
			
 
				+        """
			
 
				+        Handle a JSON-RPC 2.0 request and return response.
			
 
				+
			
 
				+        Supports MCP methods: initialize, tools/list, tools/call
			
 
				+        """
			
 
				+
			
 
				+        method = request.get('method')
			
 
				+        params = request.get('params', {})
			
 
				+        request_id = request.get('id')
			
 
				+
			
 
				+        try:
			
 
				+            # Route to appropriate handler
			
 
				+            if method == 'initialize':
			
 
				+                result = self.handle_initialize(params)
			
 
				+            elif method == 'tools/list':
			
 
				+                result = self.handle_tools_list(params)
			
 
				+            elif method == 'tools/call':
			
 
				+                result = self.handle_tools_call(params)
			
 
				+            else:
			
 
				+                # Method not found
			
 
				+                return {
			
 
				+                    "jsonrpc": "2.0",
			
 
				+                    "id": request_id,
			
 
				+                    "error": {
			
 
				+                        "code": -32601,
			
 
				+                        "message": f"Method not found: {method}"
			
 
				+                    }
			
 
				+                }
			
 
				+
			
 
				+            # Success response
			
 
				+            return {
			
 
				+                "jsonrpc": "2.0",
			
 
				+                "id": request_id,
			
 
				+                "result": result
			
 
				+            }
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            # Error response
			
 
				+            error_trace = traceback.format_exc()
			
 
				+            return {
			
 
				+                "jsonrpc": "2.0",
			
 
				+                "id": request_id,
			
 
				+                "error": {
			
 
				+                    "code": -32603,
			
 
				+                    "message": str(e),
			
 
				+                    "data": error_trace
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+    def run_stdio_server(self):
			
 
				+        """
			
 
				+        Run the MCP server in stdio mode.
			
 
				+
			
 
				+        Reads JSON-RPC requests from stdin (one per line),
			
 
				+        writes JSON-RPC responses to stdout (one per line).
			
 
				+        """
			
 
				+
			
 
				+        # Read requests from stdin line by line
			
 
				+        for line in sys.stdin:
			
 
				+            line = line.strip()
			
 
				+            if not line:
			
 
				+                continue
			
 
				+
			
 
				+            try:
			
 
				+                # Parse JSON-RPC request
			
 
				+                request = json.loads(line)
			
 
				+
			
 
				+                # Handle request
			
 
				+                response = self.handle_request(request)
			
 
				+
			
 
				+                # Write response to stdout (use custom encoder for Click types)
			
 
				+                print(json.dumps(response, cls=MCPJSONEncoder), flush=True)
			
 
				+
			
 
				+            except json.JSONDecodeError as e:
			
 
				+                # Invalid JSON
			
 
				+                error_response = {
			
 
				+                    "jsonrpc": "2.0",
			
 
				+                    "id": None,
			
 
				+                    "error": {
			
 
				+                        "code": -32700,
			
 
				+                        "message": "Parse error",
			
 
				+                        "data": str(e)
			
 
				+                    }
			
 
				+                }
			
 
				+                print(json.dumps(error_response, cls=MCPJSONEncoder), flush=True)
			
 
				+
			
 
				+
			
 
				+def run_mcp_server():
			
 
				+    """Main entry point for MCP server"""
			
 
				+    server = MCPServer()
			
 
				+    server.run_stdio_server()