v1_core.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. __package__ = 'archivebox.api'
  2. import math
  3. from uuid import UUID
  4. from typing import List, Optional, Union, Any
  5. from datetime import datetime
  6. from django.db.models import Q
  7. from django.core.exceptions import ValidationError
  8. from django.contrib.auth import get_user_model
  9. from django.shortcuts import redirect
  10. from ninja import Router, Schema, FilterSchema, Field, Query
  11. from ninja.pagination import paginate, PaginationBase
  12. from ninja.errors import HttpError
  13. from archivebox.core.models import Snapshot, ArchiveResult, Tag
  14. from archivebox.api.v1_crawls import CrawlSchema
  15. router = Router(tags=['Core Models'])
  16. class CustomPagination(PaginationBase):
  17. class Input(Schema):
  18. limit: int = 200
  19. offset: int = 0
  20. page: int = 0
  21. class Output(Schema):
  22. total_items: int
  23. total_pages: int
  24. page: int
  25. limit: int
  26. offset: int
  27. num_items: int
  28. items: List[Any]
  29. def paginate_queryset(self, queryset, pagination: Input, **params):
  30. limit = min(pagination.limit, 500)
  31. offset = pagination.offset or (pagination.page * limit)
  32. total = queryset.count()
  33. total_pages = math.ceil(total / limit)
  34. current_page = math.ceil(offset / (limit + 1))
  35. items = queryset[offset : offset + limit]
  36. return {
  37. 'total_items': total,
  38. 'total_pages': total_pages,
  39. 'page': current_page,
  40. 'limit': limit,
  41. 'offset': offset,
  42. 'num_items': len(items),
  43. 'items': items,
  44. }
  45. ### ArchiveResult #########################################################################
  46. class MinimalArchiveResultSchema(Schema):
  47. TYPE: str = 'core.models.ArchiveResult'
  48. id: UUID
  49. created_at: datetime | None
  50. modified_at: datetime | None
  51. created_by_id: str
  52. created_by_username: str
  53. status: str
  54. retry_at: datetime | None
  55. plugin: str
  56. hook_name: str
  57. process_id: UUID | None
  58. cmd_version: str | None
  59. cmd: list[str] | None
  60. pwd: str | None
  61. output_str: str
  62. output_json: dict | None
  63. output_files: dict | None
  64. output_size: int
  65. output_mimetypes: str
  66. start_ts: datetime | None
  67. end_ts: datetime | None
  68. @staticmethod
  69. def resolve_created_by_id(obj):
  70. return str(obj.created_by.pk)
  71. @staticmethod
  72. def resolve_created_by_username(obj) -> str:
  73. return obj.created_by.username
  74. class ArchiveResultSchema(MinimalArchiveResultSchema):
  75. TYPE: str = 'core.models.ArchiveResult'
  76. snapshot_id: UUID
  77. snapshot_timestamp: str
  78. snapshot_url: str
  79. snapshot_tags: List[str]
  80. @staticmethod
  81. def resolve_snapshot_timestamp(obj):
  82. return obj.snapshot.timestamp
  83. @staticmethod
  84. def resolve_snapshot_url(obj):
  85. return obj.snapshot.url
  86. @staticmethod
  87. def resolve_snapshot_id(obj):
  88. return obj.snapshot_id
  89. @staticmethod
  90. def resolve_snapshot_tags(obj):
  91. return sorted(tag.name for tag in obj.snapshot.tags.all())
  92. class ArchiveResultFilterSchema(FilterSchema):
  93. id: Optional[str] = Field(None, q=['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
  94. search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
  95. snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__timestamp__startswith'])
  96. snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
  97. snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
  98. status: Optional[str] = Field(None, q='status')
  99. output_str: Optional[str] = Field(None, q='output_str__icontains')
  100. plugin: Optional[str] = Field(None, q='plugin__icontains')
  101. hook_name: Optional[str] = Field(None, q='hook_name__icontains')
  102. process_id: Optional[str] = Field(None, q='process__id__startswith')
  103. cmd: Optional[str] = Field(None, q='cmd__0__icontains')
  104. pwd: Optional[str] = Field(None, q='pwd__icontains')
  105. cmd_version: Optional[str] = Field(None, q='cmd_version')
  106. created_at: Optional[datetime] = Field(None, q='created_at')
  107. created_at__gte: Optional[datetime] = Field(None, q='created_at__gte')
  108. created_at__lt: Optional[datetime] = Field(None, q='created_at__lt')
  109. @router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
  110. @paginate(CustomPagination)
  111. def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
  112. """List all ArchiveResult entries matching these filters."""
  113. return filters.filter(ArchiveResult.objects.all()).distinct()
  114. @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult")
  115. def get_archiveresult(request, archiveresult_id: str):
  116. """Get a specific ArchiveResult by id."""
  117. return ArchiveResult.objects.get(Q(id__icontains=archiveresult_id))
  118. ### Snapshot #########################################################################
  119. class SnapshotSchema(Schema):
  120. TYPE: str = 'core.models.Snapshot'
  121. id: UUID
  122. created_by_id: str
  123. created_by_username: str
  124. created_at: datetime
  125. modified_at: datetime
  126. status: str
  127. retry_at: datetime | None
  128. bookmarked_at: datetime
  129. downloaded_at: Optional[datetime]
  130. url: str
  131. tags: List[str]
  132. title: Optional[str]
  133. timestamp: str
  134. archive_path: str
  135. num_archiveresults: int
  136. archiveresults: List[MinimalArchiveResultSchema]
  137. @staticmethod
  138. def resolve_created_by_id(obj):
  139. return str(obj.created_by.pk)
  140. @staticmethod
  141. def resolve_created_by_username(obj):
  142. return obj.created_by.username
  143. @staticmethod
  144. def resolve_tags(obj):
  145. return sorted(tag.name for tag in obj.tags.all())
  146. @staticmethod
  147. def resolve_num_archiveresults(obj, context):
  148. return obj.archiveresult_set.all().distinct().count()
  149. @staticmethod
  150. def resolve_archiveresults(obj, context):
  151. if context['request'].with_archiveresults:
  152. return obj.archiveresult_set.all().distinct()
  153. return ArchiveResult.objects.none()
  154. class SnapshotFilterSchema(FilterSchema):
  155. id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
  156. created_by_id: str = Field(None, q='crawl__created_by_id')
  157. created_by_username: str = Field(None, q='crawl__created_by__username__icontains')
  158. created_at__gte: datetime = Field(None, q='created_at__gte')
  159. created_at__lt: datetime = Field(None, q='created_at__lt')
  160. created_at: datetime = Field(None, q='created_at')
  161. modified_at: datetime = Field(None, q='modified_at')
  162. modified_at__gte: datetime = Field(None, q='modified_at__gte')
  163. modified_at__lt: datetime = Field(None, q='modified_at__lt')
  164. search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'timestamp__startswith'])
  165. url: Optional[str] = Field(None, q='url')
  166. tag: Optional[str] = Field(None, q='tags__name')
  167. title: Optional[str] = Field(None, q='title__icontains')
  168. timestamp: Optional[str] = Field(None, q='timestamp__startswith')
  169. bookmarked_at__gte: Optional[datetime] = Field(None, q='bookmarked_at__gte')
  170. bookmarked_at__lt: Optional[datetime] = Field(None, q='bookmarked_at__lt')
  171. @router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
  172. @paginate(CustomPagination)
  173. def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool = False):
  174. """List all Snapshot entries matching these filters."""
  175. request.with_archiveresults = with_archiveresults
  176. return filters.filter(Snapshot.objects.all()).distinct()
  177. @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
  178. def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
  179. """Get a specific Snapshot by id."""
  180. request.with_archiveresults = with_archiveresults
  181. try:
  182. return Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
  183. except Snapshot.DoesNotExist:
  184. return Snapshot.objects.get(Q(id__icontains=snapshot_id))
  185. ### Tag #########################################################################
  186. class TagSchema(Schema):
  187. TYPE: str = 'core.models.Tag'
  188. id: UUID
  189. modified_at: datetime
  190. created_at: datetime
  191. created_by_id: str
  192. created_by_username: str
  193. name: str
  194. slug: str
  195. num_snapshots: int
  196. snapshots: List[SnapshotSchema]
  197. @staticmethod
  198. def resolve_created_by_id(obj):
  199. return str(obj.created_by_id)
  200. @staticmethod
  201. def resolve_created_by_username(obj):
  202. User = get_user_model()
  203. return User.objects.get(id=obj.created_by_id).username
  204. @staticmethod
  205. def resolve_num_snapshots(obj, context):
  206. return obj.snapshot_set.all().distinct().count()
  207. @staticmethod
  208. def resolve_snapshots(obj, context):
  209. if context['request'].with_snapshots:
  210. return obj.snapshot_set.all().distinct()
  211. return Snapshot.objects.none()
  212. @router.get("/tags", response=List[TagSchema], url_name="get_tags")
  213. @paginate(CustomPagination)
  214. def get_tags(request):
  215. request.with_snapshots = False
  216. request.with_archiveresults = False
  217. return Tag.objects.all().distinct()
  218. @router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
  219. def get_tag(request, tag_id: str, with_snapshots: bool = True):
  220. request.with_snapshots = with_snapshots
  221. request.with_archiveresults = False
  222. try:
  223. return Tag.objects.get(id__icontains=tag_id)
  224. except (Tag.DoesNotExist, ValidationError):
  225. return Tag.objects.get(slug__icontains=tag_id)
  226. @router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
  227. def get_any(request, id: str):
  228. """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
  229. request.with_snapshots = False
  230. request.with_archiveresults = False
  231. for getter in [get_snapshot, get_archiveresult, get_tag]:
  232. try:
  233. response = getter(request, id)
  234. if response:
  235. return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
  236. except Exception:
  237. pass
  238. try:
  239. from archivebox.api.v1_crawls import get_crawl
  240. response = get_crawl(request, id)
  241. if response:
  242. return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
  243. except Exception:
  244. pass
  245. raise HttpError(404, 'Object with given ID not found')
  246. ### Tag Editor API Endpoints #########################################################################
  247. class TagAutocompleteSchema(Schema):
  248. tags: List[dict]
# Request body for the tag creation endpoint.
class TagCreateSchema(Schema):
    name: str  # requested tag name; the endpoint strips it and rejects an empty result
# Response body of the tag creation endpoint.
class TagCreateResponseSchema(Schema):
    success: bool
    tag_id: int  # primary key of the created or already-existing tag
    tag_name: str  # name as stored on the returned tag
    created: bool  # whether a new tag was created by this request
  256. class TagSnapshotRequestSchema(Schema):
  257. snapshot_id: str
  258. tag_name: Optional[str] = None
  259. tag_id: Optional[int] = None
# Response body shared by the add-to-snapshot and remove-from-snapshot endpoints.
class TagSnapshotResponseSchema(Schema):
    success: bool
    tag_id: int  # primary key of the tag that was added or removed
    tag_name: str  # name of the tag that was added or removed
  264. @router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete")
  265. def tags_autocomplete(request, q: str = ""):
  266. """Return tags matching the query for autocomplete."""
  267. if not q:
  268. # Return all tags if no query (limited to 50)
  269. tags = Tag.objects.all().order_by('name')[:50]
  270. else:
  271. tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20]
  272. return {
  273. 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags]
  274. }
  275. @router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create")
  276. def tags_create(request, data: TagCreateSchema):
  277. """Create a new tag or return existing one."""
  278. name = data.name.strip()
  279. if not name:
  280. raise HttpError(400, 'Tag name is required')
  281. tag, created = Tag.objects.get_or_create(
  282. name__iexact=name,
  283. defaults={
  284. 'name': name,
  285. 'created_by': request.user if request.user.is_authenticated else None,
  286. }
  287. )
  288. # If found by case-insensitive match, use that tag
  289. if not created:
  290. tag = Tag.objects.filter(name__iexact=name).first()
  291. return {
  292. 'success': True,
  293. 'tag_id': tag.pk,
  294. 'tag_name': tag.name,
  295. 'created': created,
  296. }
  297. @router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot")
  298. def tags_add_to_snapshot(request, data: TagSnapshotRequestSchema):
  299. """Add a tag to a snapshot. Creates the tag if it doesn't exist."""
  300. # Get the snapshot
  301. try:
  302. snapshot = Snapshot.objects.get(
  303. Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
  304. )
  305. except Snapshot.DoesNotExist:
  306. raise HttpError(404, 'Snapshot not found')
  307. except Snapshot.MultipleObjectsReturned:
  308. snapshot = Snapshot.objects.filter(
  309. Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
  310. ).first()
  311. # Get or create the tag
  312. if data.tag_name:
  313. name = data.tag_name.strip()
  314. if not name:
  315. raise HttpError(400, 'Tag name is required')
  316. tag, _ = Tag.objects.get_or_create(
  317. name__iexact=name,
  318. defaults={
  319. 'name': name,
  320. 'created_by': request.user if request.user.is_authenticated else None,
  321. }
  322. )
  323. # If found by case-insensitive match, use that tag
  324. tag = Tag.objects.filter(name__iexact=name).first() or tag
  325. elif data.tag_id:
  326. try:
  327. tag = Tag.objects.get(pk=data.tag_id)
  328. except Tag.DoesNotExist:
  329. raise HttpError(404, 'Tag not found')
  330. else:
  331. raise HttpError(400, 'Either tag_name or tag_id is required')
  332. # Add the tag to the snapshot
  333. snapshot.tags.add(tag)
  334. return {
  335. 'success': True,
  336. 'tag_id': tag.pk,
  337. 'tag_name': tag.name,
  338. }
  339. @router.post("/tags/remove-from-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_remove_from_snapshot")
  340. def tags_remove_from_snapshot(request, data: TagSnapshotRequestSchema):
  341. """Remove a tag from a snapshot."""
  342. # Get the snapshot
  343. try:
  344. snapshot = Snapshot.objects.get(
  345. Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
  346. )
  347. except Snapshot.DoesNotExist:
  348. raise HttpError(404, 'Snapshot not found')
  349. except Snapshot.MultipleObjectsReturned:
  350. snapshot = Snapshot.objects.filter(
  351. Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
  352. ).first()
  353. # Get the tag
  354. if data.tag_id:
  355. try:
  356. tag = Tag.objects.get(pk=data.tag_id)
  357. except Tag.DoesNotExist:
  358. raise HttpError(404, 'Tag not found')
  359. elif data.tag_name:
  360. try:
  361. tag = Tag.objects.get(name__iexact=data.tag_name.strip())
  362. except Tag.DoesNotExist:
  363. raise HttpError(404, 'Tag not found')
  364. else:
  365. raise HttpError(400, 'Either tag_name or tag_id is required')
  366. # Remove the tag from the snapshot
  367. snapshot.tags.remove(tag)
  368. return {
  369. 'success': True,
  370. 'tag_id': tag.pk,
  371. 'tag_name': tag.name,
  372. }