
API improvements

Nick Sweeting committed 1 year ago
commit eeb2671e4d

+ 3 - 3
archivebox/api/v1_api.py

@@ -37,9 +37,9 @@ html_description=f'''
 
 
 def register_urls(api: NinjaAPI) -> NinjaAPI:
-    api.add_router('/auth/',     'api.v1_auth.router')
+    # api.add_router('/auth/',     'api.v1_auth.router')
     api.add_router('/core/',     'api.v1_core.router')
-    api.add_router('/crawls/',   'api.v1_core.router')
+    api.add_router('/crawls/',   'api.v1_crawls.router')
     api.add_router('/cli/',      'api.v1_cli.router')
     api.add_router('/jobs/',     'api.v1_actors.router')
     return api
@@ -83,7 +83,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
 api = NinjaAPIWithIOCapture(
     title='ArchiveBox API',
     description=html_description,
-    version='1.0.0',
+    version=VERSION,
     csrf=False,
     auth=API_AUTH_METHODS,
     urls_namespace="api-1",

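Note: swapping the hard-coded '1.0.0' for VERSION means the OpenAPI schema and the /api/v1/docs page now report the real installed release. A minimal sketch of the effect (the import path for VERSION is an assumption here; check v1_api.py's actual imports):

    # sketch only — not ArchiveBox's real module layout
    from archivebox.config import VERSION   # assumed location of the version constant
    from ninja import NinjaAPI

    api = NinjaAPI(
        title='ArchiveBox API',
        version=VERSION,    # tracks the installed release instead of a stale '1.0.0'
    )
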
+ 2 - 2
archivebox/api/v1_cli.py

@@ -17,10 +17,10 @@ from archivebox.misc.util import ansi_to_html
 from archivebox.config.common import ARCHIVING_CONFIG
 
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
 # router for API that exposes archivebox cli subcommands as REST endpoints
-router = Router(tags=['ArchiveBox CLI Sub-Commands'], auth=API_AUTH_METHODS)
+router = Router(tags=['ArchiveBox CLI Sub-Commands'])
 
 
 # Schemas

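With auth removed from the individual routers, authentication is enforced once at the NinjaAPI level (note the auth=API_AUTH_METHODS kept in v1_api.py above), and routers like this one are declared bare. A self-contained sketch of the pattern using django-ninja's standard security classes — the header name and token check below are illustrative, not ArchiveBox's real auth:

    from ninja import NinjaAPI, Router
    from ninja.security import APIKeyHeader

    class TokenAuth(APIKeyHeader):
        param_name = 'X-API-Key'   # illustrative header name

        def authenticate(self, request, key):
            # stand-in for a real token lookup against the database
            return key if key == 'secret' else None

    api = NinjaAPI(title='Demo API', auth=TokenAuth())   # auth applied to every router

    router = Router(tags=['Demo'])   # no per-router auth=... needed anymore

    @router.get('/ping')
    def ping(request):
        return {'ok': True}

    api.add_router('/demo/', router)
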
+ 7 - 106
archivebox/api/v1_core.py

@@ -16,12 +16,13 @@ from ninja.errors import HttpError
 
 from core.models import Snapshot, ArchiveResult, Tag
 from api.models import APIToken, OutboundWebhook
-from crawls.models import Crawl
-from seeds.models import Seed
+from api.v1_crawls import CrawlSchema, SeedSchema
 
-from .auth import API_AUTH_METHODS
+# from .auth import API_AUTH_METHODS
 
-router = Router(tags=['Core Models'], auth=API_AUTH_METHODS)
+
+
+router = Router(tags=['Core Models'])
 
 
 
@@ -397,108 +398,6 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
         pass
     return tag
 
-
-
-class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
-
-    id: UUID
-    abid: str
-    
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-    
-    uri: str
-    tags_str: str
-    config: dict
-    
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-    
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-    
[email protected]("/seeds", response=List[SeedSchema], url_name="get_seeds")
-def get_seeds(request):
-    return Seed.objects.all().distinct()
-
[email protected]("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
-def get_seed(request, seed_id: str):
-    seed = None
-    request.with_snapshots = False
-    request.with_archiveresults = False
-    
-    try:
-        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
-    except Exception:
-        pass
-    return seed
-
-
-class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
-
-    id: UUID
-    abid: str
-
-    modified_at: datetime
-    created_at: datetime
-    created_by_id: str
-    created_by_username: str
-    
-    status: str
-    retry_at: datetime | None
-
-    seed: SeedSchema
-    max_depth: int
-    
-    # snapshots: List[SnapshotSchema]
-
-    @staticmethod
-    def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
-    
-    @staticmethod
-    def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
-    
-    @staticmethod
-    def resolve_snapshots(obj, context):
-        if context['request'].with_snapshots:
-            return obj.snapshot_set.all().distinct()
-        return Snapshot.objects.none()
-
-
[email protected]("/crawls", response=List[CrawlSchema], url_name="get_crawls")
-def get_crawls(request):
-    return Crawl.objects.all().distinct()
-
[email protected]("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
-def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
-    """Get a specific Crawl by id or abid."""
-    
-    crawl = None
-    request.with_snapshots = with_snapshots
-    request.with_archiveresults = with_archiveresults
-    
-    try:
-        crawl = Crawl.objects.get(abid__icontains=crawl_id)
-    except Exception:
-        pass
-
-    try:
-        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
-    except Exception:
-        pass
-    return crawl
-
-
 @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
 @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
 def get_any(request, abid: str):
 def get_any(request, abid: str):
     """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
     """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
@@ -529,11 +428,13 @@ def get_any(request, abid: str):
         pass
     
     try:
+        from api.v1_crawls import get_seed
         response = response or get_seed(request, abid)
     except Exception:
         pass
     
     try:
+        from api.v1_crawls import get_crawl
         response = response or get_crawl(request, abid)
     except Exception:
         pass
     

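Moving SeedSchema/CrawlSchema out to api.v1_crawls means v1_core and v1_crawls now reference each other, so get_any() defers those imports into the function body to dodge a circular import at module load time. Simplified sketch of the pattern as used above:

    def get_any(request, abid: str):
        response = None
        # ... snapshot / archiveresult / tag lookups run first ...
        try:
            # imported here, not at module top-level, to avoid a circular
            # import between api.v1_core and api.v1_crawls
            from api.v1_crawls import get_seed
            response = response or get_seed(request, abid)
        except Exception:
            pass
        try:
            from api.v1_crawls import get_crawl
            response = response or get_crawl(request, abid)
        except Exception:
            pass
        return response
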
+ 119 - 0
archivebox/api/v1_crawls.py

@@ -0,0 +1,119 @@
+__package__ = 'archivebox.api'
+
+from uuid import UUID
+from typing import List
+from datetime import datetime
+
+from django.db.models import Q
+from django.contrib.auth import get_user_model
+
+from ninja import Router, Schema
+
+from core.models import Snapshot
+from crawls.models import Crawl
+from seeds.models import Seed
+
+from .auth import API_AUTH_METHODS
+
+router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
+
+
+class SeedSchema(Schema):
+    TYPE: str = 'seeds.models.Seed'
+
+    id: UUID
+    abid: str
+    
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+    
+    uri: str
+    tags_str: str
+    config: dict
+    
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+    
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+    
[email protected]("/seeds", response=List[SeedSchema], url_name="get_seeds")
+def get_seeds(request):
+    return Seed.objects.all().distinct()
+
[email protected]("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
+def get_seed(request, seed_id: str):
+    seed = None
+    request.with_snapshots = False
+    request.with_archiveresults = False
+    
+    try:
+        seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
+    except Exception:
+        pass
+    return seed
+
+
+class CrawlSchema(Schema):
+    TYPE: str = 'core.models.Crawl'
+
+    id: UUID
+    abid: str
+
+    modified_at: datetime
+    created_at: datetime
+    created_by_id: str
+    created_by_username: str
+    
+    status: str
+    retry_at: datetime | None
+
+    seed: SeedSchema
+    max_depth: int
+    
+    # snapshots: List[SnapshotSchema]
+
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+    
+    @staticmethod
+    def resolve_created_by_username(obj):
+        User = get_user_model()
+        return User.objects.get(id=obj.created_by_id).username
+    
+    @staticmethod
+    def resolve_snapshots(obj, context):
+        if context['request'].with_snapshots:
+            return obj.snapshot_set.all().distinct()
+        return Snapshot.objects.none()
+
+
[email protected]("/crawls", response=List[CrawlSchema], url_name="get_crawls")
+def get_crawls(request):
+    return Crawl.objects.all().distinct()
+
[email protected]("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
+def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
+    """Get a specific Crawl by id or abid."""
+    
+    crawl = None
+    request.with_snapshots = with_snapshots
+    request.with_archiveresults = with_archiveresults
+    
+    try:
+        crawl = Crawl.objects.get(abid__icontains=crawl_id)
+    except Exception:
+        pass
+
+    try:
+        crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
+    except Exception:
+        pass
+    return crawl
+

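Once this router is mounted at /crawls/ (see v1_api.py above), the endpoints should be reachable under the API prefix. A hypothetical client session — the base URL, auth header name, and token below are assumptions; check archivebox/api/auth.py for the scheme your install actually expects:

    import requests

    BASE = 'http://localhost:8000/api/v1'                  # assumed mount point
    headers = {'X-ArchiveBox-API-Key': 'your-token-here'}  # hypothetical header name

    # list all crawls (router mounted at /crawls/, endpoint path is /crawls)
    crawls = requests.get(f'{BASE}/crawls/crawls', headers=headers).json()
    for crawl in crawls:
        print(crawl['abid'], crawl['status'], crawl['seed']['uri'])

    # fetch one crawl by (partial) abid or id, matching get_crawl()'s icontains lookup
    one = requests.get(f'{BASE}/crawls/crawl/{crawls[0]["abid"]}', headers=headers).json()
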
+ 11 - 3
archivebox/cli/archivebox_add.py

@@ -40,6 +40,7 @@ def add(urls: str | list[str],
         extractors: str="",
         extractors: str="",
         parser: str="auto",
         parser: str="auto",
         persona: str='Default',
         persona: str='Default',
+        bg: bool=False,
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive"""
     """Add a new URL or list of URLs to your archive"""
 
 
@@ -51,7 +52,6 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
     
-    
     from seeds.models import Seed
     from crawls.models import Crawl
     from actors.orchestrator import Orchestrator
@@ -83,8 +83,9 @@ def add(urls: str | list[str],
     # from crawls.actors import CrawlActor
     # from core.actors import SnapshotActor, ArchiveResultActor
 
-    orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=2)
-    orchestrator.start()
+    if not bg:
+        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
+        orchestrator.start()
     
     # 5. return the list of new Snapshots created
     return crawl.snapshot_set.all()
@@ -169,6 +170,12 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
         help="Name of accounts persona to use when archiving.",
         help="Name of accounts persona to use when archiving.",
         default="Default",
         default="Default",
     )
     )
+    parser.add_argument(
+        "--bg",
+        default=False,
+        action="store_true",
+        help="Enqueue a background worker to complete the crawl instead of running it immediately",
+    )
     command = parser.parse_args(args or ())
     command = parser.parse_args(args or ())
     urls = command.urls
     urls = command.urls
 
 
@@ -193,6 +200,7 @@ def main(args: list[str] | None=None, stdin: IO | None=None, pwd: str | None=Non
         extractors=command.extract,
         parser=command.parser,
         persona=command.persona,
+        bg=command.bg,
     )
 
 

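With --bg, add() records the Seed and Crawl but skips starting the Orchestrator, leaving the actual archiving to an already-running background worker. The same thing from Python — import path inferred from the file path above; whether a worker is actually running is up to your deployment:

    from archivebox.cli.archivebox_add import add

    snapshots = add(
        urls=['https://example.com'],
        bg=True,   # new: don't start Orchestrator(exit_on_idle=True, ...) in-process
    )
    print(f'queued {snapshots.count()} snapshot(s) for a background worker')
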
+ 13 - 11
archivebox/core/urls.py

@@ -23,30 +23,32 @@ urlpatterns = [
     re_path(r"^static/(?P<path>.*)$", serve_static),
     re_path(r"^static/(?P<path>.*)$", serve_static),
     # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
     # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
 
 
-    path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda *_: 1/0),                                             # type: ignore
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
 
-    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
-    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
-
-    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
-    path('archive/', RedirectView.as_view(url='/')),
-    
-    path('accounts/', include('django.contrib.auth.urls')),
-    path('admin/', archivebox_admin.urls),
-    path("api/",      include('api.urls'), name='api'),
 
 
     path('public/', PublicIndexView.as_view(), name='public-index'),
     path('public/', PublicIndexView.as_view(), name='public-index'),
     
     
+    path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
 
 
+    path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),
     path('add/', AddView.as_view(), name='add'),
     
     
     path("jobs/",     JobsDashboardView.as_view(), name='jobs_dashboard'),
     path("jobs/",     JobsDashboardView.as_view(), name='jobs_dashboard'),
 
 
+    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
+    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
+
+
+    path('accounts/', include('django.contrib.auth.urls')),
+    path('admin/', archivebox_admin.urls),
+    
+    path("api/",      include('api.urls'), name='api'),
+
+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+    path('error/', lambda *_: 1/0),                                             # type: ignore
 
 
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
 

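The reordering is mostly cosmetic grouping (related routes moved next to each other), but it leans on the usual rule: Django resolves urlpatterns top to bottom and stops at the first match, so the exact 'archive/' redirect now sits directly above the parameterized Snapshot route it shadows. Toy demonstration of first-match-wins (standalone, not ArchiveBox code):

    from django.http import HttpResponse
    from django.urls import path

    def index(request):
        return HttpResponse('archive index redirect')

    def snapshot(request, path):
        return HttpResponse(f'snapshot: {path}')

    urlpatterns = [
        path('archive/', index),                # matches /archive/ exactly, checked first
        path('archive/<path:path>', snapshot),  # catches everything below /archive/
    ]
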
+ 2 - 2
archivebox/pkgs/abx-spec-config/abx_spec_config/base_configset.py

@@ -185,7 +185,7 @@ class BaseConfigSet(BaseSettings):
             return computed_default
         return value
     
-    def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
+    def update_in_place(self, warn=False, persist=False, hint='', **kwargs):
         """
         Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
         Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@@ -201,7 +201,7 @@ class BaseConfigSet(BaseSettings):
         if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
             warn = False
         
-        if warn:
+        if warn or os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on'):
             fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
             print(f'\n[yellow]:warning:  WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
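Net effect of these two changes: update_in_place() is quiet by default (warn=False), but setting DEBUG in the environment forces the "fixing automatically" notice back on, overriding both the warn argument and the already-warned cache. The truthiness parsing it uses:

    import os

    def debug_enabled() -> bool:
        # mirrors the check added above: any of these values count as truthy
        return os.environ.get('DEBUG', '').lower() in ('true', '1', 'yes', 'on')

    # e.g. running `DEBUG=1 archivebox add ...` would re-enable the config-fix warning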