2
0

schema.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. import os
  2. from datetime import datetime
  3. from typing import List, Dict, Any, Optional, Union
  4. from dataclasses import dataclass, asdict, field, fields
  5. class ArchiveError(Exception):
  6. def __init__(self, message, hints=None):
  7. super().__init__(message)
  8. self.hints = hints
  9. LinkDict = Dict[str, Any]
  10. ArchiveOutput = Union[str, Exception, None]
  11. @dataclass(frozen=True)
  12. class ArchiveResult:
  13. cmd: List[str]
  14. pwd: Optional[str]
  15. cmd_version: Optional[str]
  16. output: ArchiveOutput
  17. status: str
  18. start_ts: datetime
  19. end_ts: datetime
  20. schema: str = 'ArchiveResult'
  21. def __post_init__(self):
  22. self.typecheck()
  23. def _asdict(self):
  24. return asdict(self)
  25. def typecheck(self) -> None:
  26. assert self.schema == self.__class__.__name__
  27. assert isinstance(self.status, str) and self.status
  28. assert isinstance(self.start_ts, datetime)
  29. assert isinstance(self.end_ts, datetime)
  30. assert isinstance(self.cmd, list)
  31. assert all(isinstance(arg, str) and arg for arg in self.cmd)
  32. assert self.pwd is None or isinstance(self.pwd, str) and self.pwd
  33. assert self.cmd_version is None or isinstance(self.cmd_version, str) and self.cmd_version
  34. assert self.output is None or isinstance(self.output, (str, Exception))
  35. if isinstance(self.output, str):
  36. assert self.output
  37. @classmethod
  38. def from_json(cls, json_info):
  39. from .util import parse_date
  40. info = {
  41. key: val
  42. for key, val in json_info.items()
  43. if key in cls.field_names()
  44. }
  45. info['start_ts'] = parse_date(info['start_ts'])
  46. info['end_ts'] = parse_date(info['end_ts'])
  47. return cls(**info)
  48. def to_json(self, indent=4, sort_keys=True):
  49. from .util import to_json
  50. return to_json(self, indent=indent, sort_keys=sort_keys)
  51. def to_csv(self, cols=None, ljust: int=0, separator: str=','):
  52. from .util import to_json
  53. cols = cols or self.field_names()
  54. return separator.join(
  55. to_json(getattr(self, col), indent=None).ljust(ljust)
  56. for col in cols
  57. )
  58. @classmethod
  59. def field_names(cls):
  60. return [f.name for f in fields(cls)]
  61. @property
  62. def duration(self) -> int:
  63. return (self.end_ts - self.start_ts).seconds
  64. @dataclass(frozen=True)
  65. class Link:
  66. timestamp: str
  67. url: str
  68. title: Optional[str]
  69. tags: Optional[str]
  70. sources: List[str]
  71. history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
  72. updated: Optional[datetime] = None
  73. schema: str = 'Link'
  74. def __post_init__(self):
  75. self.typecheck()
  76. def overwrite(self, **kwargs):
  77. """pure functional version of dict.update that returns a new instance"""
  78. return Link(**{**self._asdict(), **kwargs})
  79. def __eq__(self, other):
  80. if not isinstance(other, Link):
  81. return NotImplemented
  82. return self.url == other.url
  83. def __gt__(self, other):
  84. if not isinstance(other, Link):
  85. return NotImplemented
  86. if not self.timestamp or not other.timestamp:
  87. return
  88. return float(self.timestamp) > float(other.timestamp)
  89. def typecheck(self) -> None:
  90. assert self.schema == self.__class__.__name__
  91. assert isinstance(self.timestamp, str) and self.timestamp
  92. assert self.timestamp.replace('.', '').isdigit()
  93. assert isinstance(self.url, str) and '://' in self.url
  94. assert self.updated is None or isinstance(self.updated, datetime)
  95. assert self.title is None or isinstance(self.title, str) and self.title
  96. assert self.tags is None or isinstance(self.tags, str) and self.tags
  97. assert isinstance(self.sources, list)
  98. assert all(isinstance(source, str) and source for source in self.sources)
  99. assert isinstance(self.history, dict)
  100. for method, results in self.history.items():
  101. assert isinstance(method, str) and method
  102. assert isinstance(results, list)
  103. assert all(isinstance(result, ArchiveResult) for result in results)
  104. def _asdict(self, extended=False):
  105. info = {
  106. 'schema': 'Link',
  107. 'url': self.url,
  108. 'title': self.title or None,
  109. 'timestamp': self.timestamp,
  110. 'updated': self.updated or None,
  111. 'tags': self.tags or None,
  112. 'sources': self.sources or [],
  113. 'history': self.history or {},
  114. }
  115. if extended:
  116. info.update({
  117. 'link_dir': self.link_dir,
  118. 'archive_path': self.archive_path,
  119. 'bookmarked_date': self.bookmarked_date,
  120. 'updated_date': self.updated_date,
  121. 'domain': self.domain,
  122. 'path': self.path,
  123. 'basename': self.basename,
  124. 'extension': self.extension,
  125. 'base_url': self.base_url,
  126. 'is_static': self.is_static,
  127. 'is_archived': self.is_archived,
  128. 'num_outputs': self.num_outputs,
  129. 'num_failures': self.num_failures,
  130. 'oldest_archive_date': self.oldest_archive_date,
  131. 'newest_archive_date': self.newest_archive_date,
  132. })
  133. return info
  134. @classmethod
  135. def from_json(cls, json_info):
  136. from .util import parse_date
  137. info = {
  138. key: val
  139. for key, val in json_info.items()
  140. if key in cls.field_names()
  141. }
  142. info['updated'] = parse_date(info['updated'])
  143. json_history = info['history']
  144. cast_history = {}
  145. for method, method_history in json_history.items():
  146. cast_history[method] = []
  147. for json_result in method_history:
  148. assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
  149. cast_result = ArchiveResult.from_json(json_result)
  150. cast_history[method].append(cast_result)
  151. info['history'] = cast_history
  152. return cls(**info)
  153. def to_json(self, indent=4, sort_keys=True):
  154. from .util import to_json
  155. return to_json(self, indent=indent, sort_keys=sort_keys)
  156. def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
  157. from .util import to_json
  158. return separator.join(
  159. to_json(getattr(self, col), indent=None).ljust(ljust)
  160. for col in csv_cols
  161. )
  162. @classmethod
  163. def field_names(cls):
  164. return [f.name for f in fields(cls)]
  165. @property
  166. def link_dir(self) -> str:
  167. from .config import ARCHIVE_DIR
  168. return os.path.join(ARCHIVE_DIR, self.timestamp)
  169. @property
  170. def archive_path(self) -> str:
  171. from .config import ARCHIVE_DIR_NAME
  172. return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
  173. ### URL Helpers
  174. @property
  175. def urlhash(self):
  176. from .util import hashurl
  177. return hashurl(self.url)
  178. @property
  179. def extension(self) -> str:
  180. from .util import extension
  181. return extension(self.url)
  182. @property
  183. def domain(self) -> str:
  184. from .util import domain
  185. return domain(self.url)
  186. @property
  187. def path(self) -> str:
  188. from .util import path
  189. return path(self.url)
  190. @property
  191. def basename(self) -> str:
  192. from .util import basename
  193. return basename(self.url)
  194. @property
  195. def base_url(self) -> str:
  196. from .util import base_url
  197. return base_url(self.url)
  198. ### Pretty Printing Helpers
  199. @property
  200. def bookmarked_date(self) -> Optional[str]:
  201. from .util import ts_to_date
  202. return ts_to_date(self.timestamp) if self.timestamp else None
  203. @property
  204. def updated_date(self) -> Optional[str]:
  205. from .util import ts_to_date
  206. return ts_to_date(self.updated) if self.updated else None
  207. @property
  208. def archive_dates(self) -> List[datetime]:
  209. return [
  210. result.start_ts
  211. for method in self.history.keys()
  212. for result in self.history[method]
  213. ]
  214. @property
  215. def oldest_archive_date(self) -> Optional[datetime]:
  216. return min(self.archive_dates, default=None)
  217. @property
  218. def newest_archive_date(self) -> Optional[datetime]:
  219. return max(self.archive_dates, default=None)
  220. ### Archive Status Helpers
  221. @property
  222. def num_outputs(self) -> int:
  223. return len(tuple(filter(None, self.latest_outputs().values())))
  224. @property
  225. def num_failures(self) -> int:
  226. return sum(1
  227. for method in self.history.keys()
  228. for result in self.history[method]
  229. if result.status == 'failed')
  230. @property
  231. def is_static(self) -> bool:
  232. from .util import is_static_file
  233. return is_static_file(self.url)
  234. @property
  235. def is_archived(self) -> bool:
  236. from .config import ARCHIVE_DIR
  237. from .util import domain
  238. return os.path.exists(os.path.join(
  239. ARCHIVE_DIR,
  240. self.timestamp,
  241. domain(self.url),
  242. ))
  243. def latest_outputs(self, status: str=None) -> Dict[str, ArchiveOutput]:
  244. """get the latest output that each archive method produced for link"""
  245. ARCHIVE_METHODS = (
  246. 'title', 'favicon', 'wget', 'warc', 'pdf',
  247. 'screenshot', 'dom', 'git', 'media', 'archive_org',
  248. )
  249. latest: Dict[str, ArchiveOutput] = {}
  250. for archive_method in ARCHIVE_METHODS:
  251. # get most recent succesful result in history for each archive method
  252. history = self.history.get(archive_method) or []
  253. history = list(filter(lambda result: result.output, reversed(history)))
  254. if status is not None:
  255. history = list(filter(lambda result: result.status == status, history))
  256. history = list(history)
  257. if history:
  258. latest[archive_method] = history[0].output
  259. else:
  260. latest[archive_method] = None
  261. return latest
  262. def canonical_outputs(self) -> Dict[str, Optional[str]]:
  263. from .util import wget_output_path
  264. canonical = {
  265. 'index_url': 'index.html',
  266. 'favicon_url': 'favicon.ico',
  267. 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
  268. 'archive_url': wget_output_path(self),
  269. 'warc_url': 'warc',
  270. 'pdf_url': 'output.pdf',
  271. 'screenshot_url': 'screenshot.png',
  272. 'dom_url': 'output.html',
  273. 'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
  274. 'git_url': 'git',
  275. 'media_url': 'media',
  276. }
  277. if self.is_static:
  278. # static binary files like PDF and images are handled slightly differently.
  279. # they're just downloaded once and aren't archived separately multiple times,
  280. # so the wget, screenshot, & pdf urls should all point to the same file
  281. static_url = wget_output_path(self)
  282. canonical.update({
  283. 'title': self.basename,
  284. 'archive_url': static_url,
  285. 'pdf_url': static_url,
  286. 'screenshot_url': static_url,
  287. 'dom_url': static_url,
  288. })
  289. return canonical