software
/
archival.ArchiveBox
peilaus alkaen https://github.com/ArchiveBox/ArchiveBox.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
							__package__ = 'archivebox.api'

import math
from uuid import UUID
from typing import List, Optional, Union, Any
from datetime import datetime

from django.db.models import Q
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.shortcuts import redirect

from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError

from core.models import Snapshot, ArchiveResult, Tag
from api.models import APIToken, OutboundWebhook
from api.v1_crawls import CrawlSchema, SeedSchema

# from .auth import API_AUTH_METHODS


router = Router(tags=['Core Models'])


class CustomPagination(PaginationBase):
    class Input(Schema):
        limit: int = 200
        offset: int = 0
        page: int = 0


    class Output(Schema):
        total_items: int
        total_pages: int
        page: int
        limit: int
        offset: int
        num_items: int
        items: List[Any]

    def paginate_queryset(self, queryset, pagination: Input, **params):
        limit = min(pagination.limit, 500)
        offset = pagination.offset or (pagination.page * limit)
        total = queryset.count()
        total_pages = math.ceil(total / limit)
        current_page = math.ceil(offset / (limit + 1))
        items = queryset[offset : offset + limit]
        return {
            'total_items': total,
            'total_pages': total_pages,
            'page': current_page,
            'limit': limit,
            'offset': offset,
            'num_items': len(items),
            'items': items,
        }


### ArchiveResult #########################################################################

class MinimalArchiveResultSchema(Schema):
    TYPE: str = 'core.models.ArchiveResult'

    id: UUID
    abid: str

    created_at: datetime | None
    modified_at: datetime | None
    created_by_id: str
    created_by_username: str

    status: str
    retry_at: datetime | None
    
    extractor: str
    cmd_version: str | None
    cmd: list[str] | None
    pwd: str | None
    output: str | None

    start_ts: datetime | None
    end_ts: datetime | None

    @staticmethod
    def resolve_created_by_id(obj):
        return str(obj.created_by_id)
    
    @staticmethod
    def resolve_created_by_username(obj) -> str:
        User = get_user_model()
        return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]

    @staticmethod
    def resolve_abid(obj):
        return str(obj.ABID)

    @staticmethod
    def resolve_snapshot_timestamp(obj):
        return obj.snapshot.timestamp
    
    @staticmethod
    def resolve_snapshot_url(obj):
        return obj.snapshot.url

    @staticmethod
    def resolve_snapshot_id(obj):
        return str(obj.snapshot_id)
    
    @staticmethod
    def resolve_snapshot_abid(obj):
        return str(obj.snapshot.ABID)

    @staticmethod
    def resolve_snapshot_tags(obj):
        return sorted(tag.name for tag in obj.snapshot.tags.all())

class ArchiveResultSchema(MinimalArchiveResultSchema):
    TYPE: str = 'core.models.ArchiveResult'

    # ... Extends MinimalArchiveResultSchema fields ...

    snapshot_id: UUID
    snapshot_abid: str
    snapshot_timestamp: str
    snapshot_url: str
    snapshot_tags: List[str]


class ArchiveResultFilterSchema(FilterSchema):
    id: Optional[str] = Field(None, q=['id__startswith', 'abid__icontains', 'snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith'])

    search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains', 'id__startswith', 'abid__icontains', 'snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith'])
    snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith'])
    snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
    
    status: Optional[str] = Field(None, q='status')
    output: Optional[str] = Field(None, q='output__icontains')
    extractor: Optional[str] = Field(None, q='extractor__icontains')
    cmd: Optional[str] = Field(None, q='cmd__0__icontains')
    pwd: Optional[str] = Field(None, q='pwd__icontains')
    cmd_version: Optional[str] = Field(None, q='cmd_version')

    created_at: Optional[datetime] = Field(None, q='created_at')
    created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
    created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')


@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
@paginate(CustomPagination)
def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
    """List all ArchiveResult entries matching these filters."""
    qs = ArchiveResult.objects.all()
    results = filters.filter(qs).distinct()
    return results


@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult")
def get_archiveresult(request, archiveresult_id: str):
    """Get a specific ArchiveResult by id or abid."""
    return ArchiveResult.objects.get(Q(id__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id))


# @router.post("/archiveresult", response=ArchiveResultSchema)
# def create_archiveresult(request, payload: ArchiveResultSchema):
#     archiveresult = ArchiveResult.objects.create(**payload.dict())
#     return archiveresult
#
# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#   
#     for attr, value in payload.dict().items():
#         setattr(archiveresult, attr, value)
#     archiveresult.save()
#
#     return archiveresult
#
# @router.delete("/archiveresult/{archiveresult_id}")
# def delete_archiveresult(request, archiveresult_id: str):
#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
#     archiveresult.delete()
#     return {"success": True}


### Snapshot #########################################################################


class SnapshotSchema(Schema):
    TYPE: str = 'core.models.Snapshot'

    id: UUID
    abid: str

    created_by_id: str
    created_by_username: str
    created_at: datetime
    modified_at: datetime
    
    status: str
    retry_at: datetime | None

    bookmarked_at: datetime
    downloaded_at: Optional[datetime]

    url: str
    tags: List[str]
    title: Optional[str]
    timestamp: str
    archive_path: str

    # url_for_admin: str
    # url_for_view: str

    num_archiveresults: int
    archiveresults: List[MinimalArchiveResultSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        return str(obj.created_by_id)
    
    @staticmethod
    def resolve_created_by_username(obj):
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username

    @staticmethod
    def resolve_abid(obj):
        return str(obj.ABID)

    @staticmethod
    def resolve_tags(obj):
        return sorted(tag.name for tag in obj.tags.all())

    # @staticmethod
    # def resolve_url_for_admin(obj):
    #     return f"/admin/core/snapshot/{obj.id}/change/"
    
    # @staticmethod
    # def resolve_url_for_view(obj):
    #     return f"/{obj.archive_path}"

    @staticmethod
    def resolve_num_archiveresults(obj, context):
        return obj.archiveresult_set.all().distinct().count()

    @staticmethod
    def resolve_archiveresults(obj, context):
        if context['request'].with_archiveresults:
            return obj.archiveresult_set.all().distinct()
        return ArchiveResult.objects.none()


class SnapshotFilterSchema(FilterSchema):
    id: Optional[str] = Field(None, q=['id__icontains', 'abid__icontains', 'timestamp__startswith'])
    abid: Optional[str] = Field(None, q='abid__icontains')

    created_by_id: str = Field(None, q='created_by_id')
    created_by_username: str = Field(None, q='created_by__username__icontains')

    created_at__gte: datetime = Field(None, q='created_at__gte')
    created_at__lt: datetime = Field(None, q='created_at__lt')
    created_at: datetime = Field(None, q='created_at')
    modified_at: datetime = Field(None, q='modified_at')
    modified_at__gte: datetime = Field(None, q='modified_at__gte')
    modified_at__lt: datetime = Field(None, q='modified_at__lt')

    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'timestamp__startswith'])
    url: Optional[str] = Field(None, q='url')
    tag: Optional[str] = Field(None, q='tags__name')
    title: Optional[str] = Field(None, q='title__icontains')
    timestamp: Optional[str] = Field(None, q='timestamp__startswith')
    
    bookmarked_at__gte: Optional[datetime] = Field(None, q='bookmarked_at__gte')
    bookmarked_at__lt: Optional[datetime] = Field(None, q='bookmarked_at__lt')


@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
@paginate(CustomPagination)
def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=False):
    """List all Snapshot entries matching these filters."""
    request.with_archiveresults = with_archiveresults

    qs = Snapshot.objects.all()
    results = filters.filter(qs).distinct()
    return results

@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
    """Get a specific Snapshot by abid or id."""
    request.with_archiveresults = with_archiveresults
    snapshot = None
    try:
        snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id) | Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
    except Snapshot.DoesNotExist:
        pass

    try:
        snapshot = snapshot or Snapshot.objects.get(Q(abid__icontains=snapshot_id) | Q(id__icontains=snapshot_id))
    except Snapshot.DoesNotExist:
        pass

    if not snapshot:
        raise Snapshot.DoesNotExist

    return snapshot


# @router.post("/snapshot", response=SnapshotSchema)
# def create_snapshot(request, payload: SnapshotSchema):
#     snapshot = Snapshot.objects.create(**payload.dict())
#     return snapshot
#
# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#
#     for attr, value in payload.dict().items():
#         setattr(snapshot, attr, value)
#     snapshot.save()
#
#     return snapshot
#
# @router.delete("/snapshot/{snapshot_id}")
# def delete_snapshot(request, snapshot_id: str):
#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
#     snapshot.delete()
#     return {"success": True}


### Tag #########################################################################


class TagSchema(Schema):
    TYPE: str = 'core.models.Tag'

    id: UUID
    abid: str

    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str

    name: str
    slug: str
    num_snapshots: int
    snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        return str(obj.created_by_id)
    
    @staticmethod
    def resolve_created_by_username(obj):
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username
    
    @staticmethod
    def resolve_num_snapshots(obj, context):
        return obj.snapshot_set.all().distinct().count()

    @staticmethod
    def resolve_snapshots(obj, context):
        if context['request'].with_snapshots:
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()

@router.get("/tags", response=List[TagSchema], url_name="get_tags")
@paginate(CustomPagination)
def get_tags(request):
    request.with_snapshots = False
    request.with_archiveresults = False
    return Tag.objects.all().distinct()

@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
def get_tag(request, tag_id: str, with_snapshots: bool=True):
    request.with_snapshots = with_snapshots
    request.with_archiveresults = False
    tag = None
    try:
        tag = Tag.objects.get(abid__icontains=tag_id)
    except (Tag.DoesNotExist, ValidationError):
        pass

    try:
        tag = tag or Tag.objects.get(id__icontains=tag_id)
    except (Tag.DoesNotExist, ValidationError):
        pass
    return tag

@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
def get_any(request, abid: str):
    """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
    
    request.with_snapshots = False
    request.with_archiveresults = False

    if abid.startswith(APIToken.abid_prefix):
        raise HttpError(403, 'APIToken objects are not accessible via REST API')
    
    if abid.startswith(OutboundWebhook.abid_prefix):
        raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API')
    
    response = None
    try:
        response = response or get_snapshot(request, abid)
    except Exception:
        pass

    try:
        response = response or get_archiveresult(request, abid)
    except Exception:
        pass

    try:
        response = response or get_tag(request, abid)
    except Exception:
        pass
    
    try:
        from api.v1_crawls import get_seed
        response = response or get_seed(request, abid)
    except Exception:
        pass
    
    try:
        from api.v1_crawls import get_crawl
        response = response or get_crawl(request, abid)
    except Exception:
        pass
    
    if response:
        app_label, model_name = response._meta.app_label, response._meta.model_name
        return redirect(f"/api/v1/{app_label}/{model_name}/{response.abid}?{request.META['QUERY_STRING']}")

    raise HttpError(404, 'Object with given ABID not found')