extractors.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. __package__ = 'archivebox.plugantic'
  2. from typing import Optional, List, Literal, Annotated, Dict, Any
  3. from typing_extensions import Self
  4. from abc import ABC
  5. from pathlib import Path
  6. from pydantic import BaseModel, model_validator, field_serializer, AfterValidator
  7. from .binaries import (
  8. Binary,
  9. YtdlpBinary,
  10. WgetBinary,
  11. )
  12. # stubs
  class Snapshot:
      """Stub placeholder for a snapshot type; defines no attributes or behavior."""
  class ArchiveResult:
      """Stub placeholder for an archive-result type; defines no attributes or behavior."""
  def get_wget_output_path(*args, **kwargs) -> Path:
      """Stub: ignore every argument and return the resolved current directory."""
      here = Path('.')
      return here.resolve()
  def no_empty_args(args: List[str]) -> List[str]:
      """Check that no entry of *args* is empty and return *args* unchanged.

      Used below as the AfterValidator for CmdArgsList.

      Raises:
          AssertionError: if any argument has length 0.
      """
      # Raised explicitly instead of via an `assert` statement: asserts are
      # stripped when Python runs with -O, which would silently disable the check.
      empty_positions = [index for index, arg in enumerate(args) if len(arg) == 0]
      if empty_positions:
          raise AssertionError(
              f'command arguments must not be empty (empty at positions {empty_positions})'
          )
      return args
  # Closed set of extractor names accepted by Extractor.name.
  ExtractorName = Literal['wget', 'warc', 'media']


  def _require_self_prefix(handler: str) -> str:
      """Check that a handler reference starts with 'self.' and return it unchanged.

      Raises:
          ValueError: if *handler* does not start with 'self.'.
      """
      if not handler.startswith('self.'):
          raise ValueError(f"handler reference must start with 'self.', got {handler!r}")
      return handler


  # The value returned by an AfterValidator replaces the field value, so the
  # validator must hand back the string itself, not the bool from startswith().
  HandlerFuncStr = Annotated[str, AfterValidator(_require_self_prefix)]

  # List of command-line arguments in which every entry must be non-empty.
  CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]
  class Extractor(ABC, BaseModel):
      """Base model describing how one extractor runs a command through a Binary.

      Fields:
          name: one of the ExtractorName literals.
          binary: the Binary whose ``exec`` is used to run the command.
          output_path_func, should_extract_func, extract_func, exec_func:
              handler references stored as 'self.<method>' strings; this class
              only stores them and never resolves or calls them itself.
          default_args, extra_args: command-line argument lists.
          args: final argument list; when left as None it is filled in after
              validation with default_args followed by extra_args.
      """

      name: ExtractorName
      binary: Binary

      output_path_func: HandlerFuncStr = 'self.get_output_path'
      should_extract_func: HandlerFuncStr = 'self.should_extract'
      extract_func: HandlerFuncStr = 'self.extract'
      exec_func: HandlerFuncStr = 'self.exec'

      default_args: CmdArgsList = []
      extra_args: CmdArgsList = []
      args: Optional[CmdArgsList] = None

      @model_validator(mode='after')
      def validate_model(self) -> Self:
          """Fill ``args`` with default_args + extra_args when it was not supplied."""
          if self.args is None:
              self.args = [*self.default_args, *self.extra_args]
          return self

      # NOTE(review): declared without `self`; pydantic documents a self-less
      # (value[, info]) signature for field serializers - confirm this is intended.
      @field_serializer('binary', when_used='json')
      def dump_binary(binary) -> str:
          """Serialize the binary field as just its ``name`` in JSON output."""
          return binary.name

      def get_output_path(self, snapshot) -> Path:
          """Return the output path for *snapshot*: a relative path named after ``name``."""
          return Path(self.name)

      def should_extract(self, snapshot) -> bool:
          """Return True when the output path has no entries matching '*.*' yet."""
          output_dir = self.get_output_path(snapshot)
          # Path.glob() returns a generator, which is always truthy, so testing
          # it directly made this method always return False. any() consumes the
          # generator just far enough to see whether at least one entry matches.
          return not any(output_dir.glob('*.*'))

      def extract(self, url: str, **kwargs) -> Dict[str, Any]:
          """Run the extractor command for *url* and summarize the outcome.

          The command is ``url`` followed by ``args`` (or by default_args +
          extra_args when ``args`` is None), executed with the output path as
          working directory.

          Returns:
              A dict with keys 'status' ('succeeded' when the return code is 0,
              otherwise 'failed'), 'output' (last line of stripped stdout),
              'output_files' (entries matching '*.*' in the output path),
              'stdout', 'stderr' and 'returncode'.
          """
          # NOTE(review): the url is passed where get_output_path names its
          # parameter `snapshot`, and the base get_output_path accepts no extra
          # keyword arguments - confirm callers/overrides rely on this.
          output_dir = self.get_output_path(url, **kwargs)

          if self.args is not None:
              cmd = [url, *self.args]
          else:
              cmd = [url, *self.default_args, *self.extra_args]

          # presumably a subprocess.CompletedProcess-like object with bytes
          # stdout/stderr - verify against Binary.exec
          proc = self.exec(cmd, pwd=output_dir)

          # Decode each stream once and reuse the text below.
          stdout = proc.stdout.decode().strip()
          stderr = proc.stderr.decode().strip()

          return {
              'status': 'succeeded' if proc.returncode == 0 else 'failed',
              'output': stdout.split('\n')[-1],
              'output_files': list(output_dir.glob('*.*')),
              'stdout': stdout,
              'stderr': stderr,
              'returncode': proc.returncode,
          }

      def exec(self, args: CmdArgsList, pwd: Optional[Path]=None):
          """Run the binary with *args*, using *pwd* (default: '.') as working directory.

          Raises:
              AssertionError: if ``binary.loaded_provider`` is falsy.
          """
          pwd = pwd or Path('.')
          assert self.binary.loaded_provider, 'binary has no loaded provider'
          return self.binary.exec(args, pwd=pwd)
  class YtdlpExtractor(Extractor):
      """Extractor named 'media' whose default binary is a YtdlpBinary instance."""

      name: ExtractorName = 'media'
      binary: Binary = YtdlpBinary()

      def get_output_path(self, snapshot) -> Path:
          """Return a relative path named after this extractor; *snapshot* is unused."""
          dir_name = self.name
          return Path(dir_name)
  class WgetExtractor(Extractor):
      """Extractor named 'wget' whose default binary is a WgetBinary instance."""

      name: ExtractorName = 'wget'
      binary: Binary = WgetBinary()

      def get_output_path(self, snapshot) -> Path:
          """Return whatever get_wget_output_path() yields for *snapshot*."""
          wget_path = get_wget_output_path(snapshot)
          return wget_path
  class WarcExtractor(Extractor):
      """Extractor named 'warc' whose default binary is a WgetBinary instance."""

      name: ExtractorName = 'warc'
      binary: Binary = WgetBinary()

      def get_output_path(self, snapshot) -> Path:
          """Return whatever get_wget_output_path() yields for *snapshot*."""
          warc_path = get_wget_output_path(snapshot)
          return warc_path