| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446 |
- __package__ = 'archivebox.api'
- import math
- from uuid import UUID
- from typing import List, Optional, Union, Any
- from datetime import datetime
- from django.db.models import Q
- from django.core.exceptions import ValidationError
- from django.contrib.auth import get_user_model
- from django.shortcuts import redirect
- from ninja import Router, Schema, FilterSchema, Field, Query
- from ninja.pagination import paginate, PaginationBase
- from ninja.errors import HttpError
- from core.models import Snapshot, ArchiveResult, Tag
- from api.models import APIToken, OutboundWebhook
- from api.v1_crawls import CrawlSchema, SeedSchema
- # from .auth import API_AUTH_METHODS
- router = Router(tags=['Core Models'])
class CustomPagination(PaginationBase):
    """Limit/offset/page paginator that returns items inside a metadata envelope.

    The client may page either with an explicit ``offset`` or with a
    zero-based ``page`` number; a non-zero ``offset`` takes precedence.
    """

    class Input(Schema):
        # Query parameters read from the request.
        limit: int = 200   # requested page size; clamped to 1..500 when paginating
        offset: int = 0    # index of the first item; when non-zero it overrides `page`
        page: int = 0      # zero-based page number, used only when `offset` is 0

    class Output(Schema):
        # Envelope produced by paginate_queryset().
        total_items: int   # queryset.count() before slicing
        total_pages: int   # ceil(total_items / limit)
        page: int          # zero-based page containing the first returned item
        limit: int         # effective (clamped) page size
        offset: int        # effective (clamped) start index
        num_items: int     # number of items actually returned in this page
        items: List[Any]

    def paginate_queryset(self, queryset, pagination: Input, **params):
        """Slice ``queryset`` according to ``pagination`` and describe the slice.

        Args:
            queryset: object supporting ``.count()`` and slicing.
            pagination: validated ``Input`` carrying limit/offset/page.
            **params: extra arguments passed by the framework (unused).

        Returns:
            dict with the keys declared on ``Output``.
        """
        # Clamp to at least 1: a limit of 0 would divide by zero below and a
        # negative one would produce an invalid (negative) slice.
        limit = max(1, min(pagination.limit, 500))
        # Negative offsets/pages would also yield a negative slice start.
        offset = max(0, pagination.offset or (pagination.page * limit))
        total = queryset.count()
        total_pages = math.ceil(total / limit)
        # Floor division maps offset == page * limit back to exactly `page`
        # for every page number, regardless of how large it is.
        current_page = offset // limit
        items = queryset[offset : offset + limit]
        return {
            'total_items': total,
            'total_pages': total_pages,
            'page': current_page,
            'limit': limit,
            'offset': offset,
            'num_items': len(items),
            'items': items,
        }
- ### ArchiveResult #########################################################################
class MinimalArchiveResultSchema(Schema):
    # Serialized form of an ArchiveResult without the parent-snapshot fields.
    # (Kept as comments rather than a docstring so the generated API schema
    # description is not altered.)
    TYPE: str = 'core.models.ArchiveResult'

    id: UUID
    abid: str

    created_at: Optional[datetime]
    modified_at: Optional[datetime]
    created_by_id: str
    created_by_username: str

    status: str
    retry_at: Optional[datetime]

    extractor: str
    cmd_version: Optional[str]
    cmd: Optional[List[str]]
    pwd: Optional[str]
    output: Optional[str]
    start_ts: Optional[datetime]
    end_ts: Optional[datetime]

    # --- resolvers for fields declared on this schema ---

    @staticmethod
    def resolve_created_by_id(obj):
        # The schema field is a str, so the raw foreign-key value is stringified.
        creator_pk = obj.created_by_id
        return str(creator_pk)

    @staticmethod
    def resolve_created_by_username(obj) -> str:
        # Fetches only the username column for the creating user; indexing
        # with [0] raises IndexError if no such user row exists.
        usernames = (
            get_user_model()
            .objects
            .filter(pk=obj.created_by_id)
            .values_list('username', flat=True)
        )
        return usernames[0]

    @staticmethod
    def resolve_abid(obj):
        return str(obj.ABID)

    # --- resolvers for snapshot_* fields ---
    # These fields are not declared on this class; ArchiveResultSchema, which
    # subclasses it, declares them and inherits these resolvers.

    @staticmethod
    def resolve_snapshot_timestamp(obj):
        parent = obj.snapshot
        return parent.timestamp

    @staticmethod
    def resolve_snapshot_url(obj):
        parent = obj.snapshot
        return parent.url

    @staticmethod
    def resolve_snapshot_id(obj):
        return str(obj.snapshot_id)

    @staticmethod
    def resolve_snapshot_abid(obj):
        parent = obj.snapshot
        return str(parent.ABID)

    @staticmethod
    def resolve_snapshot_tags(obj):
        # Tag names of the parent snapshot, in ascending order.
        names = [tag.name for tag in obj.snapshot.tags.all()]
        names.sort()
        return names
class ArchiveResultSchema(MinimalArchiveResultSchema):
    # Full ArchiveResult representation: every field inherited from
    # MinimalArchiveResultSchema plus details of the parent snapshot.
    # The values are produced by the resolve_snapshot_* static methods
    # inherited from the base class.
    TYPE: str = 'core.models.ArchiveResult'

    snapshot_id: UUID
    snapshot_abid: str
    snapshot_timestamp: str
    snapshot_url: str
    snapshot_tags: list[str]
# Lookup groups shared by several ArchiveResult filters below. Each Field gets
# its own freshly built list so no list object is shared between fields.
_AR_SNAPSHOT_ID_LOOKUPS = [
    'snapshot__id__startswith',
    'snapshot__abid__icontains',
    'snapshot__timestamp__startswith',
]
_AR_ANY_ID_LOOKUPS = [
    'id__startswith',
    'abid__icontains',
    *_AR_SNAPSHOT_ID_LOOKUPS,
]
_AR_SEARCH_LOOKUPS = [
    'snapshot__url__icontains',
    'snapshot__title__icontains',
    'snapshot__tags__name__icontains',
    'extractor',
    'output__icontains',
    *_AR_ANY_ID_LOOKUPS,
]


class ArchiveResultFilterSchema(FilterSchema):
    # Query-string filters accepted by the ArchiveResult list endpoint.
    # Every filter is optional; `q` names the ORM lookup(s) it is applied to.

    # Identifier / free-text filters that span several lookups.
    id: Optional[str] = Field(None, q=[*_AR_ANY_ID_LOOKUPS])
    search: Optional[str] = Field(None, q=[*_AR_SEARCH_LOOKUPS])

    # Filters on the parent snapshot.
    snapshot_id: Optional[str] = Field(None, q=[*_AR_SNAPSHOT_ID_LOOKUPS])
    snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')

    # Filters on the ArchiveResult's own columns.
    status: Optional[str] = Field(None, q='status')
    output: Optional[str] = Field(None, q='output__icontains')
    extractor: Optional[str] = Field(None, q='extractor__icontains')
    cmd: Optional[str] = Field(None, q='cmd__0__icontains')
    pwd: Optional[str] = Field(None, q='pwd__icontains')
    cmd_version: Optional[str] = Field(None, q='cmd_version')

    # Creation-time filters: exact match, inclusive lower bound, exclusive upper bound.
    created_at: Optional[datetime] = Field(None, q='created_at')
    created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
    created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')
# NOTE(review): url_name here is "get_archiveresult", the same name used by the
# single-object route "/archiveresult/{archiveresult_id}" — confirm whether
# "get_archiveresults" was intended before changing it (callers may reverse() it).
@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
@paginate(CustomPagination)
def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
    """List all ArchiveResult entries matching these filters."""
    # Apply the query-string filters, then de-duplicate the resulting rows;
    # pagination is handled by the decorator.
    matching = filters.filter(ArchiveResult.objects.all())
    return matching.distinct()
@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult")
def get_archiveresult(request, archiveresult_id: str):
    """Get a specific ArchiveResult by id or abid."""
    # A case-insensitive substring match on either identifier is accepted.
    # .get() raises if zero or more than one row matches.
    id_match = Q(id__icontains=archiveresult_id)
    abid_match = Q(abid__icontains=archiveresult_id)
    return ArchiveResult.objects.get(id_match | abid_match)
- # @router.post("/archiveresult", response=ArchiveResultSchema)
- # def create_archiveresult(request, payload: ArchiveResultSchema):
- # archiveresult = ArchiveResult.objects.create(**payload.dict())
- # return archiveresult
- #
- # @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
- # def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
- # archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
- #
- # for attr, value in payload.dict().items():
- # setattr(archiveresult, attr, value)
- # archiveresult.save()
- #
- # return archiveresult
- #
- # @router.delete("/archiveresult/{archiveresult_id}")
- # def delete_archiveresult(request, archiveresult_id: str):
- # archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
- # archiveresult.delete()
- # return {"success": True}
- ### Snapshot #########################################################################
class SnapshotSchema(Schema):
    # Serialized form of a Snapshot, optionally embedding its ArchiveResults.
    # (Kept as comments rather than a docstring so the generated API schema
    # description is not altered.)
    TYPE: str = 'core.models.Snapshot'

    id: UUID
    abid: str

    created_by_id: str
    created_by_username: str
    created_at: datetime
    modified_at: datetime

    status: str
    retry_at: Optional[datetime]

    bookmarked_at: datetime
    downloaded_at: Optional[datetime]

    url: str
    tags: List[str]
    title: Optional[str]
    timestamp: str
    archive_path: str

    # url_for_admin: str
    # url_for_view: str

    num_archiveresults: int
    archiveresults: List[MinimalArchiveResultSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # The schema field is a str, so the raw foreign-key value is stringified.
        creator_pk = obj.created_by_id
        return str(creator_pk)

    @staticmethod
    def resolve_created_by_username(obj):
        # Loads the creating user; .get() raises if that user does not exist.
        creator = get_user_model().objects.get(id=obj.created_by_id)
        return creator.username

    @staticmethod
    def resolve_abid(obj):
        return str(obj.ABID)

    @staticmethod
    def resolve_tags(obj):
        # Tag names in ascending order.
        names = [tag.name for tag in obj.tags.all()]
        names.sort()
        return names

    # @staticmethod
    # def resolve_url_for_admin(obj):
    #     return f"/admin/core/snapshot/{obj.id}/change/"

    # @staticmethod
    # def resolve_url_for_view(obj):
    #     return f"/{obj.archive_path}"

    @staticmethod
    def resolve_num_archiveresults(obj, context):
        # Always reported, even when the results themselves are not embedded.
        related = obj.archiveresult_set.all().distinct()
        return related.count()

    @staticmethod
    def resolve_archiveresults(obj, context):
        # The endpoint stores the caller's choice on the request object as
        # `with_archiveresults`; when it is falsy an empty queryset is returned.
        if not context['request'].with_archiveresults:
            return ArchiveResult.objects.none()
        return obj.archiveresult_set.all().distinct()
class SnapshotFilterSchema(FilterSchema):
    # Query-string filters accepted by the Snapshot list endpoint.
    # Every filter is optional and defaults to None; `q` names the ORM
    # lookup(s) the supplied value is applied to. All fields are annotated
    # Optional[...] so the annotation agrees with the None default.

    # Identifier filters.
    id: Optional[str] = Field(None, q=['id__icontains', 'abid__icontains', 'timestamp__startswith'])
    abid: Optional[str] = Field(None, q='abid__icontains')

    # Creator filters.
    created_by_id: Optional[str] = Field(None, q='created_by_id')
    created_by_username: Optional[str] = Field(None, q='created_by__username__icontains')

    # Creation / modification time: exact match, inclusive lower bound,
    # exclusive upper bound.
    created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
    created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')
    created_at: Optional[datetime] = Field(None, q='created_at')
    modified_at: Optional[datetime] = Field(None, q='modified_at')
    modified_at__gte: Optional[datetime] = Field(None, q='modified_at__gte')
    modified_at__lt: Optional[datetime] = Field(None, q='modified_at__lt')

    # Free-text search across url, title, tag names and identifiers.
    search: Optional[str] = Field(
        None,
        q=[
            'url__icontains',
            'title__icontains',
            'tags__name__icontains',
            'id__icontains',
            'abid__icontains',
            'timestamp__startswith',
        ],
    )

    # Content filters. Note `url` and `tag` are exact matches, unlike `title`.
    url: Optional[str] = Field(None, q='url')
    tag: Optional[str] = Field(None, q='tags__name')
    title: Optional[str] = Field(None, q='title__icontains')
    timestamp: Optional[str] = Field(None, q='timestamp__startswith')

    # Bookmark-time range: inclusive lower bound, exclusive upper bound.
    bookmarked_at__gte: Optional[datetime] = Field(None, q='bookmarked_at__gte')
    bookmarked_at__lt: Optional[datetime] = Field(None, q='bookmarked_at__lt')
@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
@paginate(CustomPagination)
def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=False):
    """List all Snapshot entries matching these filters."""
    # SnapshotSchema.resolve_archiveresults reads this flag off the request to
    # decide whether to embed each snapshot's ArchiveResults.
    request.with_archiveresults = with_archiveresults

    # Apply the query-string filters, then de-duplicate the resulting rows;
    # pagination is handled by the decorator.
    matching = filters.filter(Snapshot.objects.all())
    return matching.distinct()
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
    """Get a specific Snapshot by abid or id."""
    # SnapshotSchema.resolve_archiveresults reads this flag off the request.
    request.with_archiveresults = with_archiveresults

    # First preference: the value is a prefix of the abid, id or timestamp.
    prefix_match = (
        Q(abid__startswith=snapshot_id)
        | Q(id__startswith=snapshot_id)
        | Q(timestamp__startswith=snapshot_id)
    )
    # Fallback: the value appears anywhere in the abid or id (case-insensitive).
    substring_match = Q(abid__icontains=snapshot_id) | Q(id__icontains=snapshot_id)

    for criteria in (prefix_match, substring_match):
        try:
            return Snapshot.objects.get(criteria)
        except Snapshot.DoesNotExist:
            # Only "no match" moves on to the next strategy; any other error
            # (e.g. several rows matching) propagates to the caller.
            continue

    # Neither strategy found a row.
    raise Snapshot.DoesNotExist
- # @router.post("/snapshot", response=SnapshotSchema)
- # def create_snapshot(request, payload: SnapshotSchema):
- # snapshot = Snapshot.objects.create(**payload.dict())
- # return snapshot
- #
- # @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
- # def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
- # snapshot = get_object_or_404(Snapshot, id=snapshot_id)
- #
- # for attr, value in payload.dict().items():
- # setattr(snapshot, attr, value)
- # snapshot.save()
- #
- # return snapshot
- #
- # @router.delete("/snapshot/{snapshot_id}")
- # def delete_snapshot(request, snapshot_id: str):
- # snapshot = get_object_or_404(Snapshot, id=snapshot_id)
- # snapshot.delete()
- # return {"success": True}
- ### Tag #########################################################################
class TagSchema(Schema):
    # Serialized form of a Tag, optionally embedding the Snapshots that carry it.
    # (Kept as comments rather than a docstring so the generated API schema
    # description is not altered.)
    TYPE: str = 'core.models.Tag'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    name: str
    slug: str

    num_snapshots: int
    snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # The schema field is a str, so the raw foreign-key value is stringified.
        creator_pk = obj.created_by_id
        return str(creator_pk)

    @staticmethod
    def resolve_created_by_username(obj):
        # Loads the creating user; .get() raises if that user does not exist.
        creator = get_user_model().objects.get(id=obj.created_by_id)
        return creator.username

    @staticmethod
    def resolve_num_snapshots(obj, context):
        # Always reported, even when the snapshots themselves are not embedded.
        tagged = obj.snapshot_set.all().distinct()
        return tagged.count()

    @staticmethod
    def resolve_snapshots(obj, context):
        # The endpoint stores the caller's choice on the request object as
        # `with_snapshots`; when it is falsy an empty queryset is returned.
        if not context['request'].with_snapshots:
            return Snapshot.objects.none()
        return obj.snapshot_set.all().distinct()
@router.get("/tags", response=List[TagSchema], url_name="get_tags")
@paginate(CustomPagination)
def get_tags(request):
    # Lists every Tag. Both embedding flags are switched off on the request so
    # the schema resolvers return empty `snapshots` / `archiveresults` lists;
    # pagination is handled by the decorator.
    request.with_archiveresults = False
    request.with_snapshots = False

    every_tag = Tag.objects.all()
    return every_tag.distinct()
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
def get_tag(request, tag_id: str, with_snapshots: bool=True):
    # Fetches one Tag whose abid or id contains `tag_id` (case-insensitive).
    #
    # `with_snapshots` controls whether TagSchema embeds the tagged Snapshots;
    # ArchiveResults are never embedded in those Snapshots.
    #
    # Raises Tag.DoesNotExist when neither lookup matches. Previously None was
    # returned, which cannot be serialized as the declared TagSchema response;
    # raising matches what get_snapshot does for a missing Snapshot.
    # (Documented with comments rather than a docstring so no description is
    # added to the generated API schema.)
    request.with_snapshots = with_snapshots
    request.with_archiveresults = False

    # Try the abid first, then fall back to the id.
    for lookup in ('abid__icontains', 'id__icontains'):
        try:
            return Tag.objects.get(**{lookup: tag_id})
        except (Tag.DoesNotExist, ValidationError):
            # No match (or a value the field rejects): try the next lookup.
            continue

    raise Tag.DoesNotExist('Tag matching query does not exist.')
@router.get(
    "/any/{abid}",
    response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema],
    url_name="get_any",
    summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)",
)
def get_any(request, abid: str):
    """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""

    # Nothing is embedded: the matched object is only used to build a redirect.
    request.with_snapshots = False
    request.with_archiveresults = False

    # Some object types are deliberately refused, judged by the ABID prefix.
    forbidden = (
        (APIToken, 'APIToken objects are not accessible via REST API'),
        (OutboundWebhook, 'OutboundWebhook objects are not accessible via REST API'),
    )
    for model, message in forbidden:
        if abid.startswith(model.abid_prefix):
            raise HttpError(403, message)

    # The crawl-related getters are imported only at call time, inside the
    # guarded lookup below, so an import failure is swallowed like any other.
    def _find_seed(req, key):
        from api.v1_crawls import get_seed
        return get_seed(req, key)

    def _find_crawl(req, key):
        from api.v1_crawls import get_crawl
        return get_crawl(req, key)

    # Try each object type in order; the first truthy result wins and later
    # finders are not called. Any failure simply moves on to the next type.
    found = None
    for finder in (get_snapshot, get_archiveresult, get_tag, _find_seed, _find_crawl):
        try:
            found = found or finder(request, abid)
        except Exception:
            pass

    if not found:
        raise HttpError(404, 'Object with given ABID not found')

    # Redirect to the object's own endpoint, carrying the query string along.
    app_label, model_name = found._meta.app_label, found._meta.model_name
    return redirect(f"/api/v1/{app_label}/{model_name}/{found.abid}?{request.META['QUERY_STRING']}")
|