API improvements

Nick Sweeting committed 1 year ago (parent commit eeb2671e4d)

+ 3 - 3
archivebox/api/v1_api.py

@@ -37,9 +37,9 @@ html_description=f'''
 
 
 def register_urls(api: NinjaAPI) -> NinjaAPI:
-    api.add_router('/auth/',     'api.v1_auth.router')
+    # api.add_router('/auth/',     'api.v1_auth.router')
     api.add_router('/core/',     'api.v1_core.router')
-    api.add_router('/crawls/',   'api.v1_core.router')
+    api.add_router('/crawls/',   'api.v1_crawls.router')
     api.add_router('/cli/',      'api.v1_cli.router')
     api.add_router('/jobs/',     'api.v1_actors.router')
     return api
@@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
 api = NinjaAPIWithIOCapture(
     title='ArchiveBox API',
     description=html_description,
-    version='1.0.0',
+    version=VERSION,
     csrf=False,
     auth=API_AUTH_METHODS,
     urls_namespace="api-1",
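
Three changes here: the '/auth/' router is commented out, the '/crawls/' prefix previously mounted api.v1_core.router (a copy-paste slip that just duplicated the core endpoints under a second path) and now points at the new api.v1_crawls.router, and the OpenAPI version string is no longer hardcoded to '1.0.0'. A minimal sketch of django-ninja's string-based registration (module paths are the ones from this commit; the NinjaAPI kwargs are illustrative):

    from ninja import NinjaAPI

    api = NinjaAPI(title='ArchiveBox API', urls_namespace='api-1')
    api.add_router('/core/',   'api.v1_core.router')    # dotted-path strings are resolved
    api.add_router('/crawls/', 'api.v1_crawls.router')  # at registration time, which helps
                                                        # avoid circular imports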

+ 2 - 2
archivebox/api/v1_cli.py

@@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
 from archivebox.config.common import ARCHIVING_CONFIG
 
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
 # router for API that exposes archivebox cli subcommands as REST endpoints
-router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS)
+router = Router(tags=['ArchiveBox CLI Sub-Commands'])
 
 
 # Schemas
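
Dropping auth= from the Router looks alarming but should not leave these endpoints open: django-ninja falls back to the auth configured on the NinjaAPI instance itself, and v1_api.py above still passes auth=API_AUTH_METHODS there. A sketch of that cascade (imports follow the package layout shown above):

    from ninja import NinjaAPI, Router
    from api.auth import API_AUTH_METHODS

    router = Router(tags=['ArchiveBox CLI Sub-Commands'])  # no auth= here...
    api = NinjaAPI(auth=API_AUTH_METHODS)                  # ...so the API-level default applies
    api.add_router('/cli/', router)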

+ 7 - 106
archivebox/api/v1_core.py

@@ -16,12 +16,13 @@ from ninja.errors import HttpError
 
 from core.models import Snapshot, ArchiveResult, Tag
 from api.models import APIToken, OutboundWebhook
-from crawls.models import Crawl
-from seeds.models import Seed
+from api.v1_crawls import CrawlSchema, SeedSchema
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
-router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
+
+
+router = Router(tags=['Core Models'])
 
 
 
@@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
         pass
     return tag
 
-
-
-    [... SeedSchema, get_seeds, get_seed, CrawlSchema, get_crawls, get_crawl
-         removed here; moved verbatim to archivebox/api/v1_crawls.py below ...]
-
-
 @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
 def get_any(request, abid: str):
     """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@@ -529,11 +428,13 @@ def get_any(request, abid: str):
         pass
     
     try:
+        from api.v1_crawls import get_seed
         response = response or get_seed(request, abid)
     except Exception:
         pass
     
     try:
+        from api.v1_crawls import get_crawl
         response = response or get_crawl(request, abid)
     except Exception:
         pass
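
Note the two imports moved into the function bodies: get_seed and get_crawl now live in api.v1_crawls, so deferring their import to call time, whether to dodge a potential import cycle or just to keep the coupling local, follows the standard pattern for breaking cycles between modules that need each other's names. Schematically (a.py / b.py are illustrative names, not from the commit):

    # a.py: b.py imports names from a.py at module load, so a.py
    # defers its own import of b.py into the function body instead
    def lookup(key):
        from b import resolve   # resolved at call time, after both modules are loaded
        return resolve(key)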

+ 119 - 0
archivebox/api/v1_crawls.py

@@ -0,0 +1,119 @@
+__package__ = 'archivebox.api'
+
+from uuid import UUID
+from typing import List
+from datetime import datetime
+
+from django.db.models import Q
+from django.contrib.auth import get_user_model
+
+from ninja import Router, Schema
+
+from core.models import Snapshot
+from crawls.models import Crawl
+from seeds.models import Seed
+
+from .auth import API_AUTH_METHODS
+
+router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
+
+
+class SeedSchema(Schema):
+    TYPE: str = 'seeds.models.Seed'
+
+    id: UUID
+    abid: str
+    
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+    
+    uri: str
+    tags_str: str
+    config: dict
+    
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+    
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+    
[email protected]("/seeds", response=List[SeedSchema], url_name="get_seeds")
+def get_seeds(request):
+    return Seed.objects.all().distinct()
+
[email protected]("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
+def get_seed(request, seed_id: str):
+    seed = None
+    request.with_snapshots = False
+    request.with_archiveresults = False
+    
+    try:
+        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
+    except Exception:
+        pass
+    return seed
+
+
+class CrawlSchema(Schema):
+    TYPE: str = 'crawls.models.Crawl'
+
+    id: UUID
+    abid: str
+
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+    
+    status: str
+    retry_at: datetime | None
+
+    seed: SeedSchema
+    max_depth: int
+    
+    # snapshots: List[SnapshotSchema]
+
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+    
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+    
+    @staticmethod
+    def resolve_snapshots(obj, context):
+        if context['request'].with_snapshots:
+            return obj.snapshot_set.all().distinct()
+        return Snapshot.objects.none()
+
+
[email protected]("/crawls", response=List[CrawlSchema], url_name="get_crawls")
+def get_crawls(request):
+    return Crawl.objects.all().distinct()
+
[email protected]("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+    """Get a specific Crawl by id or abid."""
+    
+    crawl = None
+    request.with_snapshots = with_snapshots
+    request.with_archiveresults = with_archiveresults
+    
+    try:
+        crawl = Crawl.objects.get(abid__icontains=crawl_id)
+    except Exception:
+        pass
+
+    try:
+        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
+    except Exception:
+        pass
+    return crawl
+
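
A rough usage sketch for the new endpoints. The paths assume the API is mounted at /api/v1/ as in the default configuration, and the Authorization header is an assumption: the schemes actually accepted are whatever API_AUTH_METHODS configures.

    import requests

    BASE = 'http://localhost:8000/api/v1'                 # assumed local dev server
    HEADERS = {'Authorization': 'Bearer YOUR_API_TOKEN'}  # hypothetical auth scheme/token

    crawls = requests.get(f'{BASE}/crawls/crawls', headers=HEADERS).json()
    crawl = requests.get(f'{BASE}/crawls/crawl/{crawls[0]["abid"]}',
                         params={'with_snapshots': True}, headers=HEADERS).json()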

+ 11 - 3
archivebox/cli/archivebox_add.py

@@ -40,6 +40,7 @@ def add(urls: str | list[str],
         extractors: str="",
         parser: str="auto",
         persona: str='Default',
+        bg: bool=False,
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive"""
 
@@ -51,7 +52,6 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
     
-    
     from seeds.models import Seed
     from crawls.models import Crawl
     from actors.orchestrator import Orchestrator
@@ -83,8 +83,9 @@ def add(urls: str | list[str],
     # from crawls.actors import CrawlActor
     # from core.actors import SnapshotActor, ArchiveResultActor
 
-    orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2)
-    orchestrator.start()
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
     
     # 5. return the list of new Snapshots created
     return crawl.snapshot_set.all()
@@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
         help="Name of accounts persona to use when archiving.",
         default="Default",
     )
+    parser.add_argument(
+        "--bg",
+        default=False,
+        action="store_true",
+        help="Enqueue the crawl for a background worker to complete instead of running it immediately",
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
 
@@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
         extractors=command.extract,
         parser=command.parser,
         persona=command.persona,
+        bg=command.bg,
     )
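
With --bg the command only records the Seed and Crawl and skips starting the in-process Orchestrator, so a separately running worker has to pick the crawl up later. The same toggle is reachable programmatically via the new kwarg (the import path is assumed from the file's location):

    from archivebox.cli.archivebox_add import add

    # CLI equivalent:  archivebox add --bg 'https://example.com'
    snapshots = add('https://example.com', bg=True)   # enqueue only; no orchestrator started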
 
 

+ 13 - 11
archivebox/core/urls.py

@@ -23,30 +23,32 @@ urlpatterns = [
     re_path(r"^static/(?P<path>.*)$", serve_static),
     # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
 
-    path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda *_: 1/0),                                             # type: ignore
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
-    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
-    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
-
-    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
-    path('archive/', RedirectView.as_view(url='/')),
-    
-    path('accounts/', include('django.contrib.auth.urls')),
-    path('admin/', archivebox_admin.urls),
-    path("api/",      include('api.urls'), name='api'),
 
     path('public/', PublicIndexView.as_view(), name='public-index'),
     
+    path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
 
+    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),
     
     path("jobs/",     JobsDashboardView.as_view(), name='jobs_dashboard'),
 
+    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
+    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
+
+
+    path('accounts/', include('django.contrib.auth.urls')),
+    path('admin/', archivebox_admin.urls),
+    
+    path("api/",      include('api.urls'), name='api'),
+
+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+    path('error/', lambda *_: 1/0),                                             # type: ignore
 
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
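
The urls.py change is pure regrouping: user-facing routes first, then auth/admin/API, then diagnostics. Django resolves urlpatterns top to bottom and stops at the first match, so reordering disjoint patterns like these does not change routing; a sketch of the one pair that looks order-sensitive (SnapshotView import path assumed):

    from django.urls import path
    from django.views.generic import RedirectView
    from core.views import SnapshotView   # import path assumed

    urlpatterns = [
        path('archive/', RedirectView.as_view(url='/')),                       # only the bare prefix
        path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),  # anything deeper
    ]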
 

+ 2 - 2
archivebox/pkgs/abx-spec-config/abx_spec_config/base_configset.py

@@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
             return computed_default
         return value
     
-    def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
+    def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
         """
         Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
         Sets them in the environment so they propagate to spawned subprocesses and persist across future re-__init__()s / reloads from the environment.
@@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
         if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
             warn = False
         
-        if warn:
+        if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
             fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
             print(f'\n[yellow]:warning:  WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
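
The net effect is that these config-fixup warnings become opt-in: warn now defaults to False, and the message only prints when a caller passes warn=True (for keys not already warned about) or the DEBUG environment variable is truthy. Isolating the truthiness test used above (debug_enabled is a hypothetical helper name):

    import os

    def debug_enabled() -> bool:
        return os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on')

    os.environ['DEBUG'] = 'yes'
    assert debug_enabled()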