# archivebox/api/v1_crawls.py — REST API endpoints for the Seed and Crawl models
  1. __package__ = 'archivebox.api'
  2. from uuid import UUID
  3. from typing import List
  4. from datetime import datetime
  5. from django.db.models import Q
  6. from django.contrib.auth import get_user_model
  7. from ninja import Router, Schema
  8. from core.models import Snapshot
  9. from crawls.models import Seed, Crawl
  10. from .auth import API_AUTH_METHODS
  11. router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
  12. class SeedSchema(Schema):
  13. TYPE: str = 'crawls.models.Seed'
  14. id: UUID
  15. abid: str
  16. modified_at: datetime
  17. created_at: datetime
  18. created_by_id: str
  19. created_by_username: str
  20. uri: str
  21. tags_str: str
  22. config: dict
  23. @staticmethod
  24. def resolve_created_by_id(obj):
  25. return str(obj.created_by_id)
  26. @staticmethod
  27. def resolve_created_by_username(obj):
  28. User = get_user_model()
  29. return User.objects.get(id=obj.created_by_id).username
  30. @router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
  31. def get_seeds(request):
  32. return Seed.objects.all().distinct()
  33. @router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
  34. def get_seed(request, seed_id: str):
  35. seed = None
  36. request.with_snapshots = False
  37. request.with_archiveresults = False
  38. try:
  39. seed = Seed.objects.get(Q(abid__icontains=seed_id) | Q(id__icontains=seed_id))
  40. except Exception:
  41. pass
  42. return seed
  43. class CrawlSchema(Schema):
  44. TYPE: str = 'crawls.models.Crawl'
  45. id: UUID
  46. abid: str
  47. modified_at: datetime
  48. created_at: datetime
  49. created_by_id: str
  50. created_by_username: str
  51. status: str
  52. retry_at: datetime | None
  53. seed: SeedSchema
  54. max_depth: int
  55. # snapshots: List[SnapshotSchema]
  56. @staticmethod
  57. def resolve_created_by_id(obj):
  58. return str(obj.created_by_id)
  59. @staticmethod
  60. def resolve_created_by_username(obj):
  61. User = get_user_model()
  62. return User.objects.get(id=obj.created_by_id).username
  63. @staticmethod
  64. def resolve_snapshots(obj, context):
  65. if context['request'].with_snapshots:
  66. return obj.snapshot_set.all().distinct()
  67. return Snapshot.objects.none()
  68. @router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
  69. def get_crawls(request):
  70. return Crawl.objects.all().distinct()
  71. @router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
  72. def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
  73. """Get a specific Crawl by id or abid."""
  74. crawl = None
  75. request.with_snapshots = with_snapshots
  76. request.with_archiveresults = with_archiveresults
  77. try:
  78. crawl = Crawl.objects.get(abid__icontains=crawl_id)
  79. except Exception:
  80. pass
  81. try:
  82. crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
  83. except Exception:
  84. pass
  85. if crawl and as_rss:
  86. # return snapshots as XML rss feed
  87. urls = [
  88. {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
  89. for snapshot in crawl.snapshot_set.all()
  90. ]
  91. xml = '<rss version="2.0"><channel>'
  92. for url in urls:
  93. xml += f'<item><url>{url["url"]}</url><title>{url["title"]}</title><bookmarked_at>{url["bookmarked_at"]}</bookmarked_at><tags>{url["tags"]}</tags></item>'
  94. xml += '</channel></rss>'
  95. return xml
  96. return crawl