
API improvements

Nick Sweeting committed 1 year ago
commit eeb2671e4d

+ 3 - 3
archivebox/api/v1_api.py

@@ -37,9 +37,9 @@ html_description=f'''
 
 
 def register_urls(api: NinjaAPI) -> NinjaAPI:
-    api.add_router('/auth/',     'api.v1_auth.router')
+    # api.add_router('/auth/',     'api.v1_auth.router')
     api.add_router('/core/',     'api.v1_core.router')
-    api.add_router('/crawls/',   'api.v1_core.router')
+    api.add_router('/crawls/',   'api.v1_crawls.router')
     api.add_router('/cli/',      'api.v1_cli.router')
     api.add_router('/jobs/',     'api.v1_actors.router')
     return api
@@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
 api = NinjaAPIWithIOCapture(
     title='ArchiveBox API',
     description=html_description,
-    version='1.0.0',
+    version=VERSION,
     csrf=False,
     auth=API_AUTH_METHODS,
     urls_namespace="api-1",

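Note: swapping the hard-coded '1.0.0' for VERSION means the OpenAPI schema and the /api/v1/docs page now report the real installed release. A minimal sketch of the effect (the import path for VERSION is an assumption here; check v1_api.py's actual imports):

    # sketch only — not ArchiveBox's real module layout
    from archivebox.config import VERSION   # assumed location of the version constant
    from ninja import NinjaAPI

    api = NinjaAPI(
        title='ArchiveBox API',
        version=VERSION,    # tracks the installed release instead of a stale '1.0.0'
    )
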
+ 2 - 2
archivebox/api/v1_cli.py

@@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
 from archivebox.config.common import ARCHIVING_CONFIG
 
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
 # router for API that exposes archivebox cli subcommands as REST endpoints
-router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS)
+router = Router(tags=['ArchiveBox CLI Sub-Commands'])
 
 
 # Schemas

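With auth removed from the individual routers, authentication is enforced once at the NinjaAPI level (note the auth=API_AUTH_METHODS kept in v1_api.py above), and routers like this one are declared bare. A self-contained sketch of the pattern using django-ninja's standard security classes — the header name and token check below are illustrative, not ArchiveBox's real auth:

    from ninja import NinjaAPI, Router
    from ninja.security import APIKeyHeader

    class TokenAuth(APIKeyHeader):
        param_name = 'X-API-Key'   # illustrative header name

        def authenticate(self, request, key):
            # stand-in for a real token lookup against the database
            return key if key == 'secret' else None

    api = NinjaAPI(title='Demo API', auth=TokenAuth())   # auth applied to every router

    router = Router(tags=['Demo'])   # no per-router auth=... needed anymore

    @router.get('/ping')
    def ping(request):
        return {'ok': True}

    api.add_router('/demo/', router)
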
+ 7 - 106
archivebox/api/v1_core.py

@@ -16,12 +16,13 @@ from ninja.errors import HttpError
 
 from core.models import Snapshot, ArchiveResult, Tag
 from api.models import APIToken, OutboundWebhook
-from crawls.models import Crawl
-from seeds.models import Seed
+from api.v1_crawls import CrawlSchema, SeedSchema
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
-router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
+
+
+router = Router(tags=['Core Models'])
 
 
 
@@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
         pass
     return tag
 
-
-
-class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
-
-    id: UUID
-    abid: str
-    
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-    
-    uri: str
-    tags_str: str
-    config: dict
-    
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-    
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-    
[email protected]("/seeds", response=List[SeedSchema], url_name="get_seeds")
-def get_seeds(request):
-    return Seed.objects.all().distinct()
-
[email protected]("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
-def get_seed(request, seed_id: str):
-    seed = None
-    request.with_snapshots = False
-    request.with_archiveresults = False
-    
-    try:
-        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
-    except Exception:
-        pass
-    return seed
-
-
-class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
-
-    id: UUID
-    abid: str
-
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-    
-    status: str
-    retry_at: datetime | None
-
-    seed: SeedSchema
-    max_depth: int
-    
-    # snapshots: List[SnapshotSchema]
-
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-    
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-    
-    @staticmethod
-    def resolve_snapshots(obj, context):
-        if context['request'].with_snapshots:
-            return obj.snapshot_set.all().distinct()
-        return Snapshot.objects.none()
-
-
[email protected]("/crawls", response=List[CrawlSchema], url_name="get_crawls")
-def get_crawls(request):
-    return Crawl.objects.all().distinct()
-
[email protected]("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
-def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
-    """Get a specific Crawl by id or abid."""
-    
-    crawl = None
-    request.with_snapshots = with_snapshots
-    request.with_archiveresults = with_archiveresults
-    
-    try:
-        crawl = Crawl.objects.get(abid__icontains=crawl_id)
-    except Exception:
-        pass
-
-    try:
-        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
-    except Exception:
-        pass
-    return crawl
-
-
 @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
 @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
 def get_any(request, abid: str):
 def get_any(request, abid: str):
     """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
     """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@@ -529,11 +428,13 @@ def get_any(request, abid: str):
         pass
     
     try:
+        from api.v1_crawls import get_seed
         response = response or get_seed(request, abid)
     except Exception:
         pass
     
     try:
+        from api.v1_crawls import get_crawl
         response = response or get_crawl(request, abid)
     except Exception:
         pass
     

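Moving SeedSchema/CrawlSchema out to api.v1_crawls means v1_core and v1_crawls now reference each other, so get_any() defers those imports into the function body to dodge a circular import at module load time. Simplified sketch of the pattern as used above:

    def get_any(request, abid: str):
        response = None
        # ... snapshot / archiveresult / tag lookups run first ...
        try:
            # imported here, not at module top-level, to avoid a circular
            # import between api.v1_core and api.v1_crawls
            from api.v1_crawls import get_seed
            response = response or get_seed(request, abid)
        except Exception:
            pass
        try:
            from api.v1_crawls import get_crawl
            response = response or get_crawl(request, abid)
        except Exception:
            pass
        return response
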
+ 119 - 0
archivebox/api/v1_crawls.py

@@ -0,0 +1,119 @@
+__package__ = 'archivebox.api'
+
+from uuid import UUID
+from typing import List
+from datetime import datetime
+
+from django.db.models import Q
+from django.contrib.auth import get_user_model
+
+from ninja import Router, Schema
+
+from core.models import Snapshot
+from crawls.models import Crawl
+from seeds.models import Seed
+
+from .auth import API_AUTH_METHODS
+
+router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
+
+
+class SeedSchema(Schema):
+    TYPE: str = 'seeds.models.Seed'
+
+    id: UUID
+    abid: str
+    
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+    
+    uri: str
+    tags_str: str
+    config: dict
+    
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+    
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+    
[email protected]("/seeds", response=List[SeedSchema], url_name="get_seeds")
+def get_seeds(request):
+    return Seed.objects.all().distinct()
+
[email protected]("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
+def get_seed(request, seed_id: str):
+    seed = None
+    request.with_snapshots = False
+    request.with_archiveresults = False
+    
+    try:
+        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
+    except Exception:
+        pass
+    return seed
+
+
+class CrawlSchema(Schema):
+    TYPE: str = 'core.models.Crawl'
+
+    id: UUID
+    abid: str
+
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+    
+    status: str
+    retry_at: datetime | None
+
+    seed: SeedSchema
+    max_depth: int
+    
+    # snapshots: List[SnapshotSchema]
+
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+    
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+    
+    @staticmethod
+    def resolve_snapshots(obj, context):
+        if context['request'].with_snapshots:
+            return obj.snapshot_set.all().distinct()
+        return Snapshot.objects.none()
+
+
[email protected]("/crawls", response=List[CrawlSchema], url_name="get_crawls")
+def get_crawls(request):
+    return Crawl.objects.all().distinct()
+
[email protected]("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+    """Get a specific Crawl by id or abid."""
+    
+    crawl = None
+    request.with_snapshots = with_snapshots
+    request.with_archiveresults = with_archiveresults
+    
+    try:
+        crawl = Crawl.objects.get(abid__icontains=crawl_id)
+    except Exception:
+        pass
+
+    try:
+        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
+    except Exception:
+        pass
+    return crawl
+

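Once this router is mounted at /crawls/ (see v1_api.py above), the endpoints should be reachable under the API prefix. A hypothetical client session — the base URL, auth header name, and token below are assumptions; check archivebox/api/auth.py for the scheme your install actually expects:

    import requests

    BASE = 'http://localhost:8000/api/v1'                  # assumed mount point
    headers = {'X-ArchiveBox-API-Key': 'your-token-here'}  # hypothetical header name

    # list all crawls (router mounted at /crawls/, endpoint path is /crawls)
    crawls = requests.get(f'{BASE}/crawls/crawls', headers=headers).json()
    for crawl in crawls:
        print(crawl['abid'], crawl['status'], crawl['seed']['uri'])

    # fetch one crawl by (partial) abid or id, matching get_crawl()'s icontains lookup
    one = requests.get(f'{BASE}/crawls/crawl/{crawls[0]["abid"]}', headers=headers).json()
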
+ 11 - 3
archivebox/cli/archivebox_add.py

@@ -40,6 +40,7 @@ def add(urls: str | list[str],
         extractors: str="",
         extractors: str="",
         parser: str="auto",
         parser: str="auto",
         persona: str='Default',
         persona: str='Default',
+        bg: bool=False,
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive"""
     """Add a new URL or list of URLs to your archive"""
 
 
@@ -51,7 +52,6 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
     
-    
     from seeds.models import Seed
     from crawls.models import Crawl
     from actors.orchestrator import Orchestrator
@@ -83,8 +83,9 @@ def add(urls: str | list[str],
     # from crawls.actors import CrawlActor
     # from core.actors import SnapshotActor, ArchiveResultActor
 
-    orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2)
-    orchestrator.start()
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
     
     # 5. return the list of new Snapshots created
     return crawl.snapshot_set.all()
@@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
         help="Name of accounts persona to use when archiving.",
         help="Name of accounts persona to use when archiving.",
         default="Default",
         default="Default",
     )
     )
+    parser.add_argument(
+        "--bg",
+        default=False,
+        action="store_true",
+        help="Enqueue a background worker to complete the crawl instead of running it immediately",
+    )
     command = parser.parse_args(args or ())
     command = parser.parse_args(args or ())
     urls = command.urls
     urls = command.urls
 
 
@@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
         extractors=command.extract,
         parser=command.parser,
         persona=command.persona,
+        bg=command.bg,
     )
 
 

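With --bg, add() records the Seed and Crawl but skips starting the Orchestrator, leaving the actual archiving to an already-running background worker. The same thing from Python — import path inferred from the file path above; whether a worker is actually running is up to your deployment:

    from archivebox.cli.archivebox_add import add

    snapshots = add(
        urls=['https://example.com'],
        bg=True,   # new: don't start Orchestrator(exit_on_idle=True, ...) in-process
    )
    print(f'queued {snapshots.count()} snapshot(s) for a background worker')
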
+ 13 - 11
archivebox/core/urls.py

@@ -23,30 +23,32 @@ urlpatterns = [
     re_path(r"^static/(?P<path>.*)$", serve_static),
     re_path(r"^static/(?P<path>.*)$", serve_static),
     # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
     # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
 
 
-    path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda *_: 1/0),                                             # type: ignore
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
 
-    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
-    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
-
-    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
-    path('archive/', RedirectView.as_view(url='/')),
-    
-    path('accounts/', include('django.contrib.auth.urls')),
-    path('admin/', archivebox_admin.urls),
-    path("api/",      include('api.urls'), name='api'),
 
 
     path('public/', PublicIndexView.as_view(), name='public-index'),
     path('public/', PublicIndexView.as_view(), name='public-index'),
     
     
+    path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
 
 
+    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),
     path('add/', AddView.as_view(), name='add'),
     
     
     path("jobs/",     JobsDashboardView.as_view(), name='jobs_dashboard'),
     path("jobs/",     JobsDashboardView.as_view(), name='jobs_dashboard'),
 
 
+    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
+    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
+
+
+    path('accounts/', include('django.contrib.auth.urls')),
+    path('admin/', archivebox_admin.urls),
+    
+    path("api/",      include('api.urls'), name='api'),
+
+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+    path('error/', lambda *_: 1/0),                                             # type: ignore
 
 
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
 

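The reordering is mostly cosmetic grouping (related routes moved next to each other), but it leans on the usual rule: Django resolves urlpatterns top to bottom and stops at the first match, so the exact 'archive/' redirect now sits directly above the parameterized Snapshot route it shadows. Toy demonstration of first-match-wins (standalone, not ArchiveBox code):

    from django.http import HttpResponse
    from django.urls import path

    def index(request):
        return HttpResponse('archive index redirect')

    def snapshot(request, path):
        return HttpResponse(f'snapshot: {path}')

    urlpatterns = [
        path('archive/', index),                # matches /archive/ exactly, checked first
        path('archive/<path:path>', snapshot),  # catches everything below /archive/
    ]
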
+ 2 - 2
archivebox/pkgs/abx-spec-config/abx_spec_config/base_configset.py

@@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
             return computed_default
         return value
     
-    def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
+    def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
         """
         Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
         Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
         if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
             warn = False
         
-        if warn:
+        if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
             fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
             print(f'\n[yellow]:warning:  WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
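Net effect of these two changes: update_in_place() is quiet by default (warn=False), but setting DEBUG in the environment forces the "fixing automatically" notice back on, overriding both the warn argument and the already-warned cache. The truthiness parsing it uses:

    import os

    def debug_enabled() -> bool:
        # mirrors the check added above: any of these values count as truthy
        return os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on')

    # e.g. running `DEBUG=1 archivebox add ...` would re-enable the config-fix warning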