v1_core.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. __package__ = 'archivebox.api'
  2. import math
  3. from uuid import UUID
  4. from typing import List, Optional, Union, Any
  5. from datetime import datetime
  6. from django.db.models import Q
  7. from django.core.exceptions import ValidationError
  8. from django.contrib.auth import get_user_model
  9. from django.shortcuts import redirect
  10. from ninja import Router, Schema, FilterSchema, Field, Query
  11. from ninja.pagination import paginate, PaginationBase
  12. from ninja.errors import HttpError
  13. from core.models import Snapshot, ArchiveResult, Tag
  14. from api.models import APIToken, OutboundWebhook
  15. from api.v1_crawls import CrawlSchema, SeedSchema
  16. # from .auth import API_AUTH_METHODS
  17. router = Router(tags=['Core Models'])
  18. class CustomPagination(PaginationBase):
  19. class Input(Schema):
  20. limit: int = 200
  21. offset: int = 0
  22. page: int = 0
  23. class Output(Schema):
  24. total_items: int
  25. total_pages: int
  26. page: int
  27. limit: int
  28. offset: int
  29. num_items: int
  30. items: List[Any]
  31. def paginate_queryset(self, queryset, pagination: Input, **params):
  32. limit = min(pagination.limit, 500)
  33. offset = pagination.offset or (pagination.page * limit)
  34. total = queryset.count()
  35. total_pages = math.ceil(total / limit)
  36. current_page = math.ceil(offset / (limit + 1))
  37. items = queryset[offset : offset + limit]
  38. return {
  39. 'total_items': total,
  40. 'total_pages': total_pages,
  41. 'page': current_page,
  42. 'limit': limit,
  43. 'offset': offset,
  44. 'num_items': len(items),
  45. 'items': items,
  46. }
  47. ### ArchiveResult #########################################################################
  48. class MinimalArchiveResultSchema(Schema):
  49. TYPE: str = 'core.models.ArchiveResult'
  50. id: UUID
  51. abid: str
  52. created_at: datetime | None
  53. modified_at: datetime | None
  54. created_by_id: str
  55. created_by_username: str
  56. status: str
  57. retry_at: datetime | None
  58. extractor: str
  59. cmd_version: str | None
  60. cmd: list[str] | None
  61. pwd: str | None
  62. output: str | None
  63. start_ts: datetime | None
  64. end_ts: datetime | None
  65. @staticmethod
  66. def resolve_created_by_id(obj):
  67. return str(obj.created_by_id)
  68. @staticmethod
  69. def resolve_created_by_username(obj) -> str:
  70. User = get_user_model()
  71. return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]
  72. @staticmethod
  73. def resolve_abid(obj):
  74. return str(obj.ABID)
  75. @staticmethod
  76. def resolve_snapshot_timestamp(obj):
  77. return obj.snapshot.timestamp
  78. @staticmethod
  79. def resolve_snapshot_url(obj):
  80. return obj.snapshot.url
  81. @staticmethod
  82. def resolve_snapshot_id(obj):
  83. return str(obj.snapshot_id)
  84. @staticmethod
  85. def resolve_snapshot_abid(obj):
  86. return str(obj.snapshot.ABID)
  87. @staticmethod
  88. def resolve_snapshot_tags(obj):
  89. return sorted(tag.name for tag in obj.snapshot.tags.all())
  90. class ArchiveResultSchema(MinimalArchiveResultSchema):
  91. TYPE: str = 'core.models.ArchiveResult'
  92. # ... Extends MinimalArchiveResultSchema fields ...
  93. snapshot_id: UUID
  94. snapshot_abid: str
  95. snapshot_timestamp: str
  96. snapshot_url: str
  97. snapshot_tags: List[str]
  98. class ArchiveResultFilterSchema(FilterSchema):
  99. id: Optional[str] = Field(None, q=['id__startswith', 'abid__icontains', 'snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith'])
  100. search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains', 'id__startswith', 'abid__icontains', 'snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith'])
  101. snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith'])
  102. snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
  103. snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
  104. status: Optional[str] = Field(None, q='status')
  105. output: Optional[str] = Field(None, q='output__icontains')
  106. extractor: Optional[str] = Field(None, q='extractor__icontains')
  107. cmd: Optional[str] = Field(None, q='cmd__0__icontains')
  108. pwd: Optional[str] = Field(None, q='pwd__icontains')
  109. cmd_version: Optional[str] = Field(None, q='cmd_version')
  110. created_at: Optional[datetime] = Field(None, q='created_at')
  111. created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
  112. created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')
  113. @router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
  114. @paginate(CustomPagination)
  115. def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
  116. """List all ArchiveResult entries matching these filters."""
  117. qs = ArchiveResult.objects.all()
  118. results = filters.filter(qs).distinct()
  119. return results
  120. @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult")
  121. def get_archiveresult(request, archiveresult_id: str):
  122. """Get a specific ArchiveResult by id or abid."""
  123. return ArchiveResult.objects.get(Q(id__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id))
  124. # @router.post("/archiveresult", response=ArchiveResultSchema)
  125. # def create_archiveresult(request, payload: ArchiveResultSchema):
  126. # archiveresult = ArchiveResult.objects.create(**payload.dict())
  127. # return archiveresult
  128. #
  129. # @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
  130. # def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
  131. # archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
  132. #
  133. # for attr, value in payload.dict().items():
  134. # setattr(archiveresult, attr, value)
  135. # archiveresult.save()
  136. #
  137. # return archiveresult
  138. #
  139. # @router.delete("/archiveresult/{archiveresult_id}")
  140. # def delete_archiveresult(request, archiveresult_id: str):
  141. # archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
  142. # archiveresult.delete()
  143. # return {"success": True}
  144. ### Snapshot #########################################################################
  145. class SnapshotSchema(Schema):
  146. TYPE: str = 'core.models.Snapshot'
  147. id: UUID
  148. abid: str
  149. created_by_id: str
  150. created_by_username: str
  151. created_at: datetime
  152. modified_at: datetime
  153. status: str
  154. retry_at: datetime | None
  155. bookmarked_at: datetime
  156. downloaded_at: Optional[datetime]
  157. url: str
  158. tags: List[str]
  159. title: Optional[str]
  160. timestamp: str
  161. archive_path: str
  162. # url_for_admin: str
  163. # url_for_view: str
  164. num_archiveresults: int
  165. archiveresults: List[MinimalArchiveResultSchema]
  166. @staticmethod
  167. def resolve_created_by_id(obj):
  168. return str(obj.created_by_id)
  169. @staticmethod
  170. def resolve_created_by_username(obj):
  171. User = get_user_model()
  172. return User.objects.get(id=obj.created_by_id).username
  173. @staticmethod
  174. def resolve_abid(obj):
  175. return str(obj.ABID)
  176. @staticmethod
  177. def resolve_tags(obj):
  178. return sorted(tag.name for tag in obj.tags.all())
  179. # @staticmethod
  180. # def resolve_url_for_admin(obj):
  181. # return f"/admin/core/snapshot/{obj.id}/change/"
  182. # @staticmethod
  183. # def resolve_url_for_view(obj):
  184. # return f"/{obj.archive_path}"
  185. @staticmethod
  186. def resolve_num_archiveresults(obj, context):
  187. return obj.archiveresult_set.all().distinct().count()
  188. @staticmethod
  189. def resolve_archiveresults(obj, context):
  190. if context['request'].with_archiveresults:
  191. return obj.archiveresult_set.all().distinct()
  192. return ArchiveResult.objects.none()
  193. class SnapshotFilterSchema(FilterSchema):
  194. id: Optional[str] = Field(None, q=['id__icontains', 'abid__icontains', 'timestamp__startswith'])
  195. abid: Optional[str] = Field(None, q='abid__icontains')
  196. created_by_id: str = Field(None, q='created_by_id')
  197. created_by_username: str = Field(None, q='created_by__username__icontains')
  198. created_at__gte: datetime = Field(None, q='created_at__gte')
  199. created_at__lt: datetime = Field(None, q='created_at__lt')
  200. created_at: datetime = Field(None, q='created_at')
  201. modified_at: datetime = Field(None, q='modified_at')
  202. modified_at__gte: datetime = Field(None, q='modified_at__gte')
  203. modified_at__lt: datetime = Field(None, q='modified_at__lt')
  204. search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'timestamp__startswith'])
  205. url: Optional[str] = Field(None, q='url')
  206. tag: Optional[str] = Field(None, q='tags__name')
  207. title: Optional[str] = Field(None, q='title__icontains')
  208. timestamp: Optional[str] = Field(None, q='timestamp__startswith')
  209. bookmarked_at__gte: Optional[datetime] = Field(None, q='bookmarked_at__gte')
  210. bookmarked_at__lt: Optional[datetime] = Field(None, q='bookmarked_at__lt')
  211. @router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
  212. @paginate(CustomPagination)
  213. def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=False):
  214. """List all Snapshot entries matching these filters."""
  215. request.with_archiveresults = with_archiveresults
  216. qs = Snapshot.objects.all()
  217. results = filters.filter(qs).distinct()
  218. return results
  219. @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
  220. def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
  221. """Get a specific Snapshot by abid or id."""
  222. request.with_archiveresults = with_archiveresults
  223. snapshot = None
  224. try:
  225. snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id) | Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
  226. except Snapshot.DoesNotExist:
  227. pass
  228. try:
  229. snapshot = snapshot or Snapshot.objects.get(Q(abid__icontains=snapshot_id) | Q(id__icontains=snapshot_id))
  230. except Snapshot.DoesNotExist:
  231. pass
  232. if not snapshot:
  233. raise Snapshot.DoesNotExist
  234. return snapshot
  235. # @router.post("/snapshot", response=SnapshotSchema)
  236. # def create_snapshot(request, payload: SnapshotSchema):
  237. # snapshot = Snapshot.objects.create(**payload.dict())
  238. # return snapshot
  239. #
  240. # @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
  241. # def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
  242. # snapshot = get_object_or_404(Snapshot, id=snapshot_id)
  243. #
  244. # for attr, value in payload.dict().items():
  245. # setattr(snapshot, attr, value)
  246. # snapshot.save()
  247. #
  248. # return snapshot
  249. #
  250. # @router.delete("/snapshot/{snapshot_id}")
  251. # def delete_snapshot(request, snapshot_id: str):
  252. # snapshot = get_object_or_404(Snapshot, id=snapshot_id)
  253. # snapshot.delete()
  254. # return {"success": True}
  255. ### Tag #########################################################################
  256. class TagSchema(Schema):
  257. TYPE: str = 'core.models.Tag'
  258. id: UUID
  259. abid: str
  260. modified_at: datetime
  261. created_at: datetime
  262. created_by_id: str
  263. created_by_username: str
  264. name: str
  265. slug: str
  266. num_snapshots: int
  267. snapshots: List[SnapshotSchema]
  268. @staticmethod
  269. def resolve_created_by_id(obj):
  270. return str(obj.created_by_id)
  271. @staticmethod
  272. def resolve_created_by_username(obj):
  273. User = get_user_model()
  274. return User.objects.get(id=obj.created_by_id).username
  275. @staticmethod
  276. def resolve_num_snapshots(obj, context):
  277. return obj.snapshot_set.all().distinct().count()
  278. @staticmethod
  279. def resolve_snapshots(obj, context):
  280. if context['request'].with_snapshots:
  281. return obj.snapshot_set.all().distinct()
  282. return Snapshot.objects.none()
  283. @router.get("/tags", response=List[TagSchema], url_name="get_tags")
  284. @paginate(CustomPagination)
  285. def get_tags(request):
  286. request.with_snapshots = False
  287. request.with_archiveresults = False
  288. return Tag.objects.all().distinct()
  289. @router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
  290. def get_tag(request, tag_id: str, with_snapshots: bool=True):
  291. request.with_snapshots = with_snapshots
  292. request.with_archiveresults = False
  293. tag = None
  294. try:
  295. tag = Tag.objects.get(abid__icontains=tag_id)
  296. except (Tag.DoesNotExist, ValidationError):
  297. pass
  298. try:
  299. tag = tag or Tag.objects.get(id__icontains=tag_id)
  300. except (Tag.DoesNotExist, ValidationError):
  301. pass
  302. return tag
  303. @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
  304. def get_any(request, abid: str):
  305. """Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
  306. request.with_snapshots = False
  307. request.with_archiveresults = False
  308. if abid.startswith(APIToken.abid_prefix):
  309. raise HttpError(403, 'APIToken objects are not accessible via REST API')
  310. if abid.startswith(OutboundWebhook.abid_prefix):
  311. raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API')
  312. response = None
  313. try:
  314. response = response or get_snapshot(request, abid)
  315. except Exception:
  316. pass
  317. try:
  318. response = response or get_archiveresult(request, abid)
  319. except Exception:
  320. pass
  321. try:
  322. response = response or get_tag(request, abid)
  323. except Exception:
  324. pass
  325. try:
  326. from api.v1_crawls import get_seed
  327. response = response or get_seed(request, abid)
  328. except Exception:
  329. pass
  330. try:
  331. from api.v1_crawls import get_crawl
  332. response = response or get_crawl(request, abid)
  333. except Exception:
  334. pass
  335. if response:
  336. app_label, model_name = response._meta.app_label, response._meta.model_name
  337. return redirect(f"/api/v1/{app_label}/{model_name}/{response.abid}?{request.META['QUERY_STRING']}")
  338. raise HttpError(404, 'Object with given ABID not found')