v1_cli.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
# Explicitly declare the package this module belongs to; Python consults
# __package__ when resolving package-relative imports (e.g. `from .auth import ...`).
__package__ = 'archivebox.api'
  2. import json
  3. from typing import List, Dict, Any, Optional
  4. from enum import Enum
  5. from ninja import Router, Schema
  6. from archivebox.main import (
  7. add,
  8. remove,
  9. update,
  10. list_all,
  11. schedule,
  12. )
  13. from archivebox.misc.util import ansi_to_html
  14. from archivebox.config.common import ARCHIVING_CONFIG
  15. # from .auth import API_AUTH_METHODS
# Router for the API that exposes archivebox CLI subcommands as REST endpoints.
# The tag string is handed to the Router constructor
# (presumably used to group these routes in the generated API docs -- confirm).
router = Router(tags=['ArchiveBox CLI Sub-Commands'])

# Schemas

# Alias for the value shapes allowed in a command's `result` field:
# a list, a str-keyed dict, a bool, an int, a str, or None.
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
  20. class CLICommandResponseSchema(Schema):
  21. success: bool
  22. errors: List[str]
  23. result: JSONType
  24. result_format: str = 'str'
  25. stdout: str
  26. stderr: str
  27. class FilterTypeChoices(str, Enum):
  28. exact = 'exact'
  29. substring = 'substring'
  30. regex = 'regex'
  31. domain = 'domain'
  32. tag = 'tag'
  33. timestamp = 'timestamp'
  34. class StatusChoices(str, Enum):
  35. indexed = 'indexed'
  36. archived = 'archived'
  37. unarchived = 'unarchived'
  38. present = 'present'
  39. valid = 'valid'
  40. invalid = 'invalid'
  41. duplicate = 'duplicate'
  42. orphaned = 'orphaned'
  43. corrupted = 'corrupted'
  44. unrecognized = 'unrecognized'
  45. class AddCommandSchema(Schema):
  46. urls: List[str]
  47. tag: str = ""
  48. depth: int = 0
  49. update: bool = not ARCHIVING_CONFIG.ONLY_NEW # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
  50. update_all: bool = False
  51. index_only: bool = False
  52. overwrite: bool = False
  53. init: bool = False
  54. extractors: str = ""
  55. parser: str = "auto"
  56. class UpdateCommandSchema(Schema):
  57. resume: Optional[float] = 0
  58. only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
  59. index_only: bool = False
  60. overwrite: bool = False
  61. after: Optional[float] = 0
  62. before: Optional[float] = 999999999999999
  63. status: Optional[StatusChoices] = StatusChoices.unarchived
  64. filter_type: Optional[str] = FilterTypeChoices.substring
  65. filter_patterns: Optional[List[str]] = ['https://example.com']
  66. extractors: Optional[str] = ""
  67. class ScheduleCommandSchema(Schema):
  68. import_path: Optional[str] = None
  69. add: bool = False
  70. every: Optional[str] = None
  71. tag: str = ''
  72. depth: int = 0
  73. overwrite: bool = False
  74. update: bool = not ARCHIVING_CONFIG.ONLY_NEW
  75. clear: bool = False
  76. class ListCommandSchema(Schema):
  77. filter_patterns: Optional[List[str]] = ['https://example.com']
  78. filter_type: str = FilterTypeChoices.substring
  79. status: Optional[StatusChoices] = StatusChoices.indexed
  80. after: Optional[float] = 0
  81. before: Optional[float] = 999999999999999
  82. sort: str = 'bookmarked_at'
  83. as_json: bool = True
  84. as_html: bool = False
  85. as_csv: str | None = 'timestamp,url'
  86. with_headers: bool = False
  87. class RemoveCommandSchema(Schema):
  88. delete: bool = True
  89. after: Optional[float] = 0
  90. before: Optional[float] = 999999999999999
  91. filter_type: str = FilterTypeChoices.exact
  92. filter_patterns: Optional[List[str]] = ['https://example.com']
  93. @router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
  94. def cli_add(request, args: AddCommandSchema):
  95. result = add(
  96. urls=args.urls,
  97. tag=args.tag,
  98. depth=args.depth,
  99. update=args.update,
  100. update_all=args.update_all,
  101. index_only=args.index_only,
  102. overwrite=args.overwrite,
  103. init=args.init,
  104. extractors=args.extractors,
  105. parser=args.parser,
  106. )
  107. return {
  108. "success": True,
  109. "errors": [],
  110. "result": result,
  111. "stdout": ansi_to_html(request.stdout.getvalue().strip()),
  112. "stderr": ansi_to_html(request.stderr.getvalue().strip()),
  113. }
  114. @router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
  115. def cli_update(request, args: UpdateCommandSchema):
  116. result = update(
  117. resume=args.resume,
  118. only_new=args.only_new,
  119. index_only=args.index_only,
  120. overwrite=args.overwrite,
  121. before=args.before,
  122. after=args.after,
  123. status=args.status,
  124. filter_type=args.filter_type,
  125. filter_patterns=args.filter_patterns,
  126. extractors=args.extractors,
  127. )
  128. return {
  129. "success": True,
  130. "errors": [],
  131. "result": result,
  132. "stdout": ansi_to_html(request.stdout.getvalue().strip()),
  133. "stderr": ansi_to_html(request.stderr.getvalue().strip()),
  134. }
  135. @router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
  136. def cli_schedule(request, args: ScheduleCommandSchema):
  137. result = schedule(
  138. import_path=args.import_path,
  139. add=args.add,
  140. show=args.show,
  141. clear=args.clear,
  142. every=args.every,
  143. tag=args.tag,
  144. depth=args.depth,
  145. overwrite=args.overwrite,
  146. update=args.update,
  147. )
  148. return {
  149. "success": True,
  150. "errors": [],
  151. "result": result,
  152. "stdout": ansi_to_html(request.stdout.getvalue().strip()),
  153. "stderr": ansi_to_html(request.stderr.getvalue().strip()),
  154. }
  155. @router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns] (use this endpoint with ?filter_type=search to search for snapshots)')
  156. def cli_list(request, args: ListCommandSchema):
  157. result = list_all(
  158. filter_patterns=args.filter_patterns,
  159. filter_type=args.filter_type,
  160. status=args.status,
  161. after=args.after,
  162. before=args.before,
  163. sort=args.sort,
  164. csv=args.as_csv,
  165. json=args.as_json,
  166. html=args.as_html,
  167. with_headers=args.with_headers,
  168. )
  169. result_format = 'txt'
  170. if args.as_json:
  171. result_format = "json"
  172. result = json.loads(result)
  173. elif args.as_html:
  174. result_format = "html"
  175. elif args.as_csv:
  176. result_format = "csv"
  177. return {
  178. "success": True,
  179. "errors": [],
  180. "result": result,
  181. "result_format": result_format,
  182. "stdout": ansi_to_html(request.stdout.getvalue().strip()),
  183. "stderr": ansi_to_html(request.stderr.getvalue().strip()),
  184. }
  185. @router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
  186. def cli_remove(request, args: RemoveCommandSchema):
  187. result = remove(
  188. yes=True, # no way to interactively ask for confirmation via API, so we force yes
  189. delete=args.delete,
  190. before=args.before,
  191. after=args.after,
  192. filter_type=args.filter_type,
  193. filter_patterns=args.filter_patterns,
  194. )
  195. return {
  196. "success": True,
  197. "errors": [],
  198. "result": result,
  199. "stdout": ansi_to_html(request.stdout.getvalue().strip()),
  200. "stderr": ansi_to_html(request.stderr.getvalue().strip()),
  201. }