v1_core.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. __package__ = 'archivebox.api'
  2. import math
  3. from uuid import UUID
  4. from typing import List, Optional, Union, Any
  5. from datetime import datetime
  6. from django.db.models import Q
  7. from django.core.exceptions import ValidationError
  8. from django.contrib.auth import get_user_model
  9. from ninja import Router, Schema, FilterSchema, Field, Query
  10. from ninja.pagination import paginate, PaginationBase
  11. from ninja.errors import HttpError
  12. from core.models import Snapshot, ArchiveResult, Tag
  13. from api.models import APIToken, OutboundWebhook
  14. from .auth import API_AUTH_METHODS
  # Router that every endpoint in this module registers on; all of its routes
  # share the 'Core Models' tag and the API_AUTH_METHODS authentication.
  router = Router(
      tags=['Core Models'],
      auth=API_AUTH_METHODS,
  )
  class CustomPagination(PaginationBase):
      """Limit/offset/page paginator that also reports totals.

      The response wraps the requested slice in an envelope carrying the total
      item count, the total page count, the 0-based page index of the slice,
      and the effective limit/offset that were actually applied.
      """

      # Query parameters accepted from the client.
      class Input(Schema):
          limit: int = 200
          offset: int = 0
          page: int = 0

      # Envelope returned to the client around the page of items.
      class Output(Schema):
          total_items: int
          total_pages: int
          page: int
          limit: int
          offset: int
          num_items: int
          items: List[Any]

      def paginate_queryset(self, queryset, pagination: Input, **params):
          """Slice ``queryset`` according to ``pagination`` and describe the slice.

          Args:
              queryset: The queryset to paginate (must support ``count()`` and slicing).
              pagination: Parsed ``Input`` with ``limit``, ``offset`` and ``page``.
              **params: Extra keyword arguments passed by the framework (unused).

          Returns:
              A dict with the keys declared on ``Output``.
          """
          # Clamp the page size to 1..500: the upper bound caps response size, the
          # lower bound prevents a ZeroDivisionError below (limit=0) and a
          # negative slice length (limit<0).
          limit = max(1, min(pagination.limit, 500))

          # An explicit non-zero offset wins over page; otherwise derive it from
          # the 0-based page number. Negative values are clamped to the start,
          # because querysets do not support negative slicing.
          offset = max(0, pagination.offset or (pagination.page * limit))

          total = queryset.count()
          total_pages = math.ceil(total / limit)

          # 0-based index of the page that contains `offset`.
          current_page = offset // limit

          items = queryset[offset : offset + limit]
          return {
              'total_items': total,
              'total_pages': total_pages,
              'page': current_page,
              'limit': limit,
              'offset': offset,
              'num_items': len(items),
              'items': items,
          }
  45. ### ArchiveResult #########################################################################
  class MinimalArchiveResultSchema(Schema):
      # Serialized form of an ArchiveResult without its parent-snapshot details.
      # The resolve_snapshot_* helpers are defined here so that subclasses which
      # declare the matching snapshot_* fields can reuse them.
      TYPE: str = 'core.models.ArchiveResult'

      id: UUID
      abid: str

      modified_at: datetime
      created_at: datetime
      created_by_id: str
      created_by_username: str

      extractor: str
      cmd_version: Optional[str]
      cmd: List[str]
      pwd: str
      status: str
      output: str

      start_ts: Optional[datetime]
      end_ts: Optional[datetime]

      @staticmethod
      def resolve_created_by_id(obj):
          """Return the creator's primary key as a string."""
          creator_id = obj.created_by_id
          return str(creator_id)

      @staticmethod
      def resolve_created_by_username(obj):
          """Look up the creating user and return their username."""
          return get_user_model().objects.get(id=obj.created_by_id).username

      @staticmethod
      def resolve_abid(obj):
          """Return the object's ABID rendered as a string."""
          return str(obj.ABID)

      @staticmethod
      def resolve_created_at(obj):
          """Report the result's start timestamp as its creation time."""
          return obj.start_ts

      @staticmethod
      def resolve_snapshot_timestamp(obj):
          """Return the parent snapshot's timestamp."""
          parent = obj.snapshot
          return parent.timestamp

      @staticmethod
      def resolve_snapshot_url(obj):
          """Return the parent snapshot's URL."""
          parent = obj.snapshot
          return parent.url

      @staticmethod
      def resolve_snapshot_id(obj):
          """Return the parent snapshot's primary key as a string."""
          return str(obj.snapshot_id)

      @staticmethod
      def resolve_snapshot_abid(obj):
          """Return the parent snapshot's ABID rendered as a string."""
          parent = obj.snapshot
          return str(parent.ABID)

      @staticmethod
      def resolve_snapshot_tags(obj):
          """Return the parent snapshot's tag names in sorted order."""
          tag_names = [tag.name for tag in obj.snapshot.tags.all()]
          tag_names.sort()
          return tag_names
  class ArchiveResultSchema(MinimalArchiveResultSchema):
      # Full ArchiveResult representation: every field inherited from
      # MinimalArchiveResultSchema plus the parent-snapshot fields below, which
      # are filled in by the inherited resolve_snapshot_* resolvers.
      TYPE: str = 'core.models.ArchiveResult'

      snapshot_id: UUID
      snapshot_abid: str
      snapshot_timestamp: str
      snapshot_url: str
      snapshot_tags: List[str]
  class ArchiveResultFilterSchema(FilterSchema):
      # Query-string filters accepted by the ArchiveResult list endpoint.
      # Each field maps to one or more ORM lookups through its `q` argument.

      # Identifier-style lookups (result id/abid or parent snapshot id/abid/timestamp).
      id: Optional[str] = Field(
          None,
          q=[
              'id__startswith',
              'abid__icontains',
              'snapshot__id__startswith',
              'snapshot__abid__icontains',
              'snapshot__timestamp__startswith',
          ],
      )

      # Free-text search across snapshot attributes, result attributes and identifiers.
      search: Optional[str] = Field(
          None,
          q=[
              'snapshot__url__icontains',
              'snapshot__title__icontains',
              'snapshot__tags__name__icontains',
              'extractor',
              'output__icontains',
              'id__startswith',
              'abid__icontains',
              'snapshot__id__startswith',
              'snapshot__abid__icontains',
              'snapshot__timestamp__startswith',
          ],
      )

      # Filters on the parent snapshot.
      snapshot_id: Optional[str] = Field(
          None,
          q=[
              'snapshot__id__startswith',
              'snapshot__abid__icontains',
              'snapshot__timestamp__startswith',
          ],
      )
      snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
      snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')

      # Filters on the result itself.
      status: Optional[str] = Field(None, q='status')
      output: Optional[str] = Field(None, q='output__icontains')
      extractor: Optional[str] = Field(None, q='extractor__icontains')
      cmd: Optional[str] = Field(None, q='cmd__0__icontains')
      pwd: Optional[str] = Field(None, q='pwd__icontains')
      cmd_version: Optional[str] = Field(None, q='cmd_version')

      # Creation-time filters: exact match, lower bound (inclusive), upper bound (exclusive).
      created_at: Optional[datetime] = Field(None, q='created_at')
      created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
      created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')
  # NOTE(review): url_name below is "get_archiveresult", the same name used by the
  # single-object route; the snapshot and tag list routes use plural names.
  # Left unchanged because the URL name is externally visible. Confirm whether
  # "get_archiveresults" was intended.
  @router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
  @paginate(CustomPagination)
  def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
      """List all ArchiveResult entries matching these filters."""
      # Apply the query-string filters, then de-duplicate the resulting rows.
      return filters.filter(ArchiveResult.objects.all()).distinct()
  @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult")
  def get_archiveresult(request, archiveresult_id: str):
      """Get a specific ArchiveResult by id or abid."""
      # Accept a case-insensitive fragment of either identifier; .get() raises
      # if the fragment matches no row or more than one row.
      id_match = Q(id__icontains=archiveresult_id)
      abid_match = Q(abid__icontains=archiveresult_id)
      return ArchiveResult.objects.get(id_match | abid_match)
  124. # @router.post("/archiveresult", response=ArchiveResultSchema)
  125. # def create_archiveresult(request, payload: ArchiveResultSchema):
  126. # archiveresult = ArchiveResult.objects.create(**payload.dict())
  127. # return archiveresult
  128. #
  129. # @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
  130. # def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
  131. # archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
  132. #
  133. # for attr, value in payload.dict().items():
  134. # setattr(archiveresult, attr, value)
  135. # archiveresult.save()
  136. #
  137. # return archiveresult
  138. #
  139. # @router.delete("/archiveresult/{archiveresult_id}")
  140. # def delete_archiveresult(request, archiveresult_id: str):
  141. # archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
  142. # archiveresult.delete()
  143. # return {"success": True}
  144. ### Snapshot #########################################################################
  class SnapshotSchema(Schema):
      # Serialized form of a Snapshot. The nested archiveresults list is only
      # populated when the request carries a truthy `with_archiveresults` flag.
      TYPE: str = 'core.models.Snapshot'

      id: UUID
      abid: str

      created_by_id: str
      created_by_username: str
      created_at: datetime
      modified_at: datetime

      bookmarked_at: datetime
      downloaded_at: Optional[datetime]

      url: str
      tags: List[str]
      title: Optional[str]
      timestamp: str
      archive_path: str

      # url_for_admin: str
      # url_for_view: str

      num_archiveresults: int
      archiveresults: List[MinimalArchiveResultSchema]

      @staticmethod
      def resolve_created_by_id(obj):
          """Return the creator's primary key as a string."""
          creator_id = obj.created_by_id
          return str(creator_id)

      @staticmethod
      def resolve_created_by_username(obj):
          """Look up the creating user and return their username."""
          return get_user_model().objects.get(id=obj.created_by_id).username

      @staticmethod
      def resolve_abid(obj):
          """Return the snapshot's ABID rendered as a string."""
          return str(obj.ABID)

      @staticmethod
      def resolve_tags(obj):
          """Return the snapshot's tag names in sorted order."""
          tag_names = [tag.name for tag in obj.tags.all()]
          tag_names.sort()
          return tag_names

      # @staticmethod
      # def resolve_url_for_admin(obj):
      #     return f"/admin/core/snapshot/{obj.id}/change/"

      # @staticmethod
      # def resolve_url_for_view(obj):
      #     return f"/{obj.archive_path}"

      @staticmethod
      def resolve_num_archiveresults(obj, context):
          """Count the distinct ArchiveResults attached to this snapshot."""
          related_results = obj.archiveresult_set.all().distinct()
          return related_results.count()

      @staticmethod
      def resolve_archiveresults(obj, context):
          """Return the attached ArchiveResults, or an empty queryset if not requested."""
          if not context['request'].with_archiveresults:
              return ArchiveResult.objects.none()
          return obj.archiveresult_set.all().distinct()
  class SnapshotFilterSchema(FilterSchema):
      # Query-string filters accepted by the Snapshot list endpoint.
      # Each field maps to one or more ORM lookups through its `q` argument.
      # Every filter is optional and defaults to None, so all annotations are
      # Optional[...] to agree with that default.

      # Identifier lookups.
      id: Optional[str] = Field(None, q=['id__icontains', 'abid__icontains', 'timestamp__startswith'])
      abid: Optional[str] = Field(None, q='abid__icontains')

      # Creator filters.
      created_by_id: Optional[str] = Field(None, q='created_by_id')
      created_by_username: Optional[str] = Field(None, q='created_by__username__icontains')

      # Creation-time filters: lower bound (inclusive), upper bound (exclusive), exact match.
      created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
      created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')
      created_at: Optional[datetime] = Field(None, q='created_at')

      # Modification-time filters: exact match, lower bound (inclusive), upper bound (exclusive).
      modified_at: Optional[datetime] = Field(None, q='modified_at')
      modified_at__gte: Optional[datetime] = Field(None, q='modified_at__gte')
      modified_at__lt: Optional[datetime] = Field(None, q='modified_at__lt')

      # Free-text search across URL, title, tag names and identifiers.
      search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'timestamp__startswith'])

      # Exact-match URL and tag name; substring title; timestamp prefix.
      url: Optional[str] = Field(None, q='url')
      tag: Optional[str] = Field(None, q='tags__name')
      title: Optional[str] = Field(None, q='title__icontains')
      timestamp: Optional[str] = Field(None, q='timestamp__startswith')

      # Bookmark-time range: lower bound (inclusive), upper bound (exclusive).
      bookmarked_at__gte: Optional[datetime] = Field(None, q='bookmarked_at__gte')
      bookmarked_at__lt: Optional[datetime] = Field(None, q='bookmarked_at__lt')
  @router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
  @paginate(CustomPagination)
  def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=False):
      """List all Snapshot entries matching these filters."""
      # Stash the flag on the request so SnapshotSchema.resolve_archiveresults
      # can read it from the serialization context.
      request.with_archiveresults = with_archiveresults

      # Apply the query-string filters, then de-duplicate the resulting rows.
      return filters.filter(Snapshot.objects.all()).distinct()
  @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
  def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
      """Get a specific Snapshot by abid or id."""
      # Stash the flag on the request so SnapshotSchema.resolve_archiveresults
      # can read it from the serialization context.
      request.with_archiveresults = with_archiveresults

      # Lookups are tried in order: prefix matches on abid/id/timestamp first,
      # then case-insensitive substring matches on abid/id.
      ordered_lookups = (
          Q(abid__startswith=snapshot_id) | Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id),
          Q(abid__icontains=snapshot_id) | Q(id__icontains=snapshot_id),
      )
      for lookup in ordered_lookups:
          try:
              return Snapshot.objects.get(lookup)
          except Snapshot.DoesNotExist:
              # No row for this lookup; fall through to the next, looser one.
              continue

      # Neither lookup matched anything.
      raise Snapshot.DoesNotExist
  233. # @router.post("/snapshot", response=SnapshotSchema)
  234. # def create_snapshot(request, payload: SnapshotSchema):
  235. # snapshot = Snapshot.objects.create(**payload.dict())
  236. # return snapshot
  237. #
  238. # @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
  239. # def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
  240. # snapshot = get_object_or_404(Snapshot, id=snapshot_id)
  241. #
  242. # for attr, value in payload.dict().items():
  243. # setattr(snapshot, attr, value)
  244. # snapshot.save()
  245. #
  246. # return snapshot
  247. #
  248. # @router.delete("/snapshot/{snapshot_id}")
  249. # def delete_snapshot(request, snapshot_id: str):
  250. # snapshot = get_object_or_404(Snapshot, id=snapshot_id)
  251. # snapshot.delete()
  252. # return {"success": True}
  253. ### Tag #########################################################################
  class TagSchema(Schema):
      # Serialized form of a Tag. The nested snapshots list is only populated
      # when the request carries a truthy `with_snapshots` flag.
      TYPE: str = 'core.models.Tag'

      id: UUID
      abid: str

      modified_at: datetime
      created_at: datetime
      created_by_id: str
      created_by_username: str

      name: str
      slug: str
      num_snapshots: int
      snapshots: List[SnapshotSchema]

      @staticmethod
      def resolve_created_by_id(obj):
          """Return the creator's primary key as a string."""
          creator_id = obj.created_by_id
          return str(creator_id)

      @staticmethod
      def resolve_created_by_username(obj):
          """Look up the creating user and return their username."""
          return get_user_model().objects.get(id=obj.created_by_id).username

      @staticmethod
      def resolve_num_snapshots(obj, context):
          """Count the distinct Snapshots carrying this tag."""
          tagged = obj.snapshot_set.all().distinct()
          return tagged.count()

      @staticmethod
      def resolve_snapshots(obj, context):
          """Return the tagged Snapshots, or an empty queryset if not requested."""
          if not context['request'].with_snapshots:
              return Snapshot.objects.none()
          return obj.snapshot_set.all().distinct()
  @router.get("/tags", response=List[TagSchema], url_name="get_tags")
  @paginate(CustomPagination)
  def get_tags(request):
      # The list view never embeds nested snapshots or archive results; the
      # schema resolvers read these flags from the request.
      request.with_archiveresults = False
      request.with_snapshots = False

      all_tags = Tag.objects.all()
      return all_tags.distinct()
  @router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
  def get_tag(request, tag_id: str, with_snapshots: bool=True):
      """Get a specific Tag by abid or id.

      Raises Tag.DoesNotExist when neither lookup matches, mirroring get_snapshot.
      Previously this returned None, which cannot be serialized as a TagSchema.
      """
      # Flags read by the schema resolvers: nested snapshots are optional, and
      # archive results are never embedded under a tag.
      request.with_snapshots = with_snapshots
      request.with_archiveresults = False

      # Try the abid first, then the primary key; both accept a fragment.
      for lookup in ({'abid__icontains': tag_id}, {'id__icontains': tag_id}):
          try:
              return Tag.objects.get(**lookup)
          except (Tag.DoesNotExist, ValidationError):
              # No match (or the value is not valid for this field); try the next lookup.
              continue

      raise Tag.DoesNotExist('Tag matching query does not exist.')
  301. # class CrawlSchema(Schema):
  302. # TYPE: str = 'core.models.Crawl'
  303. # id: UUID
  304. # abid: str
  305. # modified_at: datetime
  306. # created_at: datetime
  307. # created_by_id: str
  308. # created_by_username: str
  309. # urls: str
  310. # depth: int
  311. # parser: str
  312. # # snapshots: List[SnapshotSchema]
  313. # @staticmethod
  314. # def resolve_created_by_id(obj):
  315. # return str(obj.created_by_id)
  316. # @staticmethod
  317. # def resolve_created_by_username(obj):
  318. # User = get_user_model()
  319. # return User.objects.get(id=obj.created_by_id).username
  320. # @staticmethod
  321. # def resolve_snapshots(obj, context):
  322. # if context['request'].with_snapshots:
  323. # return obj.snapshot_set.all().distinct()
  324. # return Snapshot.objects.none()
  325. # @router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
  326. # def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
  327. # """Get a specific Crawl by id or abid."""
  328. # crawl = None
  329. # request.with_snapshots = with_snapshots
  330. # request.with_archiveresults = with_archiveresults
  331. # try:
  332. # crawl = Crawl.objects.get(abid__icontains=crawl_id)
  333. # except Exception:
  334. # pass
  335. # try:
  336. # crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
  337. # except Exception:
  338. # pass
  339. # return crawl
  340. # [..., CrawlSchema]
  @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema], url_name="get_any")
  def get_any(request, abid: str):
      # Resolve an identifier to whichever core object it belongs to, trying
      # Snapshot, then ArchiveResult, then Tag.
      request.with_snapshots = False
      request.with_archiveresults = False

      # Refuse identifiers that point at API tokens or outbound webhooks.
      if abid.startswith(APIToken.abid_prefix):
          raise HttpError(403, 'APIToken objects are not accessible via REST API')
      if abid.startswith(OutboundWebhook.abid_prefix):
          raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API')

      # Best-effort: any failure in one getter just means "try the next type".
      # The first getter that yields a truthy object wins; later ones are skipped.
      # (get_crawl would be appended here if the Crawl endpoints are re-enabled.)
      found = None
      for getter in (get_snapshot, get_archiveresult, get_tag):
          try:
              found = getter(request, abid)
          except Exception:
              continue
          if found:
              break

      if not found:
          raise HttpError(404, 'Object with given ABID not found')
      return found
  368. return response