schema.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. """
  2. WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.
  3. DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
  4. These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
  5. """
  6. __package__ = 'archivebox.index'
  7. from datetime import datetime, timezone, timedelta
  8. from typing import List, Dict, Any, Optional, Union
  9. from dataclasses import dataclass, asdict, field, fields
  10. from django.utils.functional import cached_property
  11. from archivebox.config import ARCHIVE_DIR, CONSTANTS
  12. from plugins_extractor.favicon.config import FAVICON_CONFIG
  13. from archivebox.misc.system import get_dir_size
  14. from archivebox.misc.util import ts_to_date_str, parse_date
  15. from archivebox.misc.logging import stderr, ANSI
  16. class ArchiveError(Exception):
  17. def __init__(self, message, hints=None):
  18. super().__init__(message)
  19. self.hints = hints
# Loosely-typed dict form of a link (string keys, arbitrary values).
LinkDict = Dict[str, Any]
# Value stored in ArchiveResult.output: a string, an Exception, or None.
ArchiveOutput = Union[str, Exception, None]
  22. @dataclass(frozen=True)
  23. class ArchiveResult:
  24. cmd: List[str]
  25. pwd: Optional[str]
  26. cmd_version: Optional[str]
  27. output: ArchiveOutput
  28. status: str
  29. start_ts: datetime
  30. end_ts: datetime
  31. index_texts: Union[List[str], None] = None
  32. schema: str = 'ArchiveResult'
  33. def __post_init__(self):
  34. self.typecheck()
  35. def _asdict(self):
  36. return asdict(self)
  37. def typecheck(self) -> None:
  38. assert self.schema == self.__class__.__name__
  39. assert isinstance(self.status, str) and self.status
  40. assert isinstance(self.start_ts, datetime)
  41. assert isinstance(self.end_ts, datetime)
  42. assert isinstance(self.cmd, list)
  43. assert all(isinstance(arg, str) and arg for arg in self.cmd)
  44. # TODO: replace emptystrings in these three with None / remove them from the DB
  45. assert self.pwd is None or isinstance(self.pwd, str)
  46. assert self.cmd_version is None or isinstance(self.cmd_version, str)
  47. assert self.output is None or isinstance(self.output, (str, Exception))
  48. @classmethod
  49. def guess_ts(_cls, dict_info):
  50. parsed_timestamp = parse_date(dict_info["timestamp"])
  51. start_ts = parsed_timestamp
  52. end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
  53. return start_ts, end_ts
  54. @classmethod
  55. def from_json(cls, json_info, guess=False):
  56. info = {
  57. key: val
  58. for key, val in json_info.items()
  59. if key in cls.field_names()
  60. }
  61. if guess:
  62. keys = info.keys()
  63. if "start_ts" not in keys:
  64. info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
  65. else:
  66. info['start_ts'] = parse_date(info['start_ts'])
  67. info['end_ts'] = parse_date(info['end_ts'])
  68. if "pwd" not in keys:
  69. info["pwd"] = str(ARCHIVE_DIR / json_info["timestamp"])
  70. if "cmd_version" not in keys:
  71. info["cmd_version"] = "Undefined"
  72. if "cmd" not in keys:
  73. info["cmd"] = []
  74. else:
  75. info['start_ts'] = parse_date(info['start_ts'])
  76. info['end_ts'] = parse_date(info['end_ts'])
  77. info['cmd_version'] = info.get('cmd_version')
  78. if type(info["cmd"]) is str:
  79. info["cmd"] = [info["cmd"]]
  80. return cls(**info)
  81. def to_dict(self, *keys) -> dict:
  82. if keys:
  83. return {k: v for k, v in asdict(self).items() if k in keys}
  84. return asdict(self)
  85. def to_json(self, indent=4, sort_keys=True) -> str:
  86. from .json import to_json
  87. return to_json(self, indent=indent, sort_keys=sort_keys)
  88. def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
  89. from .csv import to_csv
  90. return to_csv(self, csv_col=cols or self.field_names(), separator=separator, ljust=ljust)
  91. @classmethod
  92. def field_names(cls):
  93. return [f.name for f in fields(cls)]
  94. @property
  95. def duration(self) -> int:
  96. return (self.end_ts - self.start_ts).seconds
  97. @dataclass(frozen=True)
  98. class Link:
  99. timestamp: str
  100. url: str
  101. title: Optional[str]
  102. tags: Optional[str]
  103. sources: List[str]
  104. history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
  105. downloaded_at: Optional[datetime] = None
  106. schema: str = 'Link'
  107. def __str__(self) -> str:
  108. return f'[{self.timestamp}] {self.url} "{self.title}"'
  109. def __post_init__(self):
  110. self.typecheck()
  111. def overwrite(self, **kwargs):
  112. """pure functional version of dict.update that returns a new instance"""
  113. return Link(**{**self._asdict(), **kwargs})
  114. def __eq__(self, other):
  115. if not isinstance(other, Link):
  116. return NotImplemented
  117. return self.url == other.url
  118. def __gt__(self, other):
  119. if not isinstance(other, Link):
  120. return NotImplemented
  121. if not self.timestamp or not other.timestamp:
  122. return
  123. return float(self.timestamp) > float(other.timestamp)
  124. def typecheck(self) -> None:
  125. try:
  126. assert self.schema == self.__class__.__name__
  127. assert isinstance(self.timestamp, str) and self.timestamp
  128. assert self.timestamp.replace('.', '').isdigit()
  129. assert isinstance(self.url, str) and '://' in self.url
  130. assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime)
  131. assert self.title is None or (isinstance(self.title, str) and self.title)
  132. assert self.tags is None or isinstance(self.tags, str)
  133. assert isinstance(self.sources, list)
  134. assert all(isinstance(source, str) and source for source in self.sources)
  135. assert isinstance(self.history, dict)
  136. for method, results in self.history.items():
  137. assert isinstance(method, str) and method
  138. assert isinstance(results, list)
  139. assert all(isinstance(result, ArchiveResult) for result in results)
  140. except Exception:
  141. stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
  142. raise
  143. def _asdict(self, extended=False):
  144. info = {
  145. 'schema': 'Link',
  146. 'url': self.url,
  147. 'title': self.title or None,
  148. 'timestamp': self.timestamp,
  149. 'downloaded_at': self.downloaded_at or None,
  150. 'tags': self.tags or None,
  151. 'sources': self.sources or [],
  152. 'history': self.history or {},
  153. }
  154. if extended:
  155. info.update({
  156. 'snapshot_id': self.snapshot_id,
  157. 'snapshot_abid': self.snapshot_abid,
  158. 'link_dir': self.link_dir,
  159. 'archive_path': self.archive_path,
  160. 'hash': self.url_hash,
  161. 'base_url': self.base_url,
  162. 'scheme': self.scheme,
  163. 'domain': self.domain,
  164. 'path': self.path,
  165. 'basename': self.basename,
  166. 'extension': self.extension,
  167. 'is_static': self.is_static,
  168. 'tags_str': (self.tags or '').strip(','), # only used to render static index in index/html.py, remove if no longer needed there
  169. 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there
  170. 'bookmarked_date': self.bookmarked_date,
  171. 'downloaded_datestr': self.downloaded_datestr,
  172. 'oldest_archive_date': self.oldest_archive_date,
  173. 'newest_archive_date': self.newest_archive_date,
  174. 'is_archived': self.is_archived,
  175. 'num_outputs': self.num_outputs,
  176. 'num_failures': self.num_failures,
  177. 'latest': self.latest_outputs(),
  178. 'canonical': self.canonical_outputs(),
  179. })
  180. return info
  181. def as_snapshot(self):
  182. from core.models import Snapshot
  183. return Snapshot.objects.get(url=self.url)
  184. @classmethod
  185. def from_json(cls, json_info, guess=False):
  186. info = {
  187. key: val
  188. for key, val in json_info.items()
  189. if key in cls.field_names()
  190. }
  191. info['downloaded_at'] = parse_date(info.get('updated') or info.get('downloaded_at'))
  192. info['sources'] = info.get('sources') or []
  193. json_history = info.get('history') or {}
  194. cast_history = {}
  195. for method, method_history in json_history.items():
  196. cast_history[method] = []
  197. for json_result in method_history:
  198. assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
  199. cast_result = ArchiveResult.from_json(json_result, guess)
  200. cast_history[method].append(cast_result)
  201. info['history'] = cast_history
  202. return cls(**info)
  203. def to_json(self, indent=4, sort_keys=True) -> str:
  204. from .json import to_json
  205. return to_json(self, indent=indent, sort_keys=sort_keys)
  206. def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
  207. from .csv import to_csv
  208. return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
  209. @cached_property
  210. def snapshot(self):
  211. from core.models import Snapshot
  212. return Snapshot.objects.only('id', 'abid').get(url=self.url)
  213. @cached_property
  214. def snapshot_id(self):
  215. return str(self.snapshot.pk)
  216. @cached_property
  217. def snapshot_abid(self):
  218. return str(self.snapshot.ABID)
  219. @classmethod
  220. def field_names(cls):
  221. return [f.name for f in fields(cls)]
  222. @property
  223. def link_dir(self) -> str:
  224. return str(ARCHIVE_DIR / self.timestamp)
  225. @property
  226. def archive_path(self) -> str:
  227. return '{}/{}'.format(CONSTANTS.ARCHIVE_DIR_NAME, self.timestamp)
  228. @property
  229. def archive_size(self) -> float:
  230. try:
  231. return get_dir_size(self.archive_path)[0]
  232. except Exception:
  233. return 0
  234. ### URL Helpers
  235. @property
  236. def url_hash(self):
  237. from archivebox.misc.util import hashurl
  238. return hashurl(self.url)
  239. @property
  240. def scheme(self) -> str:
  241. from archivebox.misc.util import scheme
  242. return scheme(self.url)
  243. @property
  244. def extension(self) -> str:
  245. from archivebox.misc.util import extension
  246. return extension(self.url)
  247. @property
  248. def domain(self) -> str:
  249. from archivebox.misc.util import domain
  250. return domain(self.url)
  251. @property
  252. def path(self) -> str:
  253. from archivebox.misc.util import path
  254. return path(self.url)
  255. @property
  256. def basename(self) -> str:
  257. from archivebox.misc.util import basename
  258. return basename(self.url)
  259. @property
  260. def base_url(self) -> str:
  261. from archivebox.misc.util import base_url
  262. return base_url(self.url)
  263. ### Pretty Printing Helpers
  264. @property
  265. def bookmarked_date(self) -> Optional[str]:
  266. max_ts = (datetime.now(timezone.utc) + timedelta(days=30)).timestamp()
  267. if self.timestamp and self.timestamp.replace('.', '').isdigit():
  268. if 0 < float(self.timestamp) < max_ts:
  269. return ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
  270. else:
  271. return str(self.timestamp)
  272. return None
  273. @property
  274. def downloaded_datestr(self) -> Optional[str]:
  275. return ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
  276. @property
  277. def archive_dates(self) -> List[datetime]:
  278. return [
  279. parse_date(result.start_ts)
  280. for method in self.history.keys()
  281. for result in self.history[method]
  282. ]
  283. @property
  284. def oldest_archive_date(self) -> Optional[datetime]:
  285. return min(self.archive_dates, default=None)
  286. @property
  287. def newest_archive_date(self) -> Optional[datetime]:
  288. return max(self.archive_dates, default=None)
  289. ### Archive Status Helpers
  290. @property
  291. def num_outputs(self) -> int:
  292. return self.as_snapshot().num_outputs
  293. @property
  294. def num_failures(self) -> int:
  295. return sum(1
  296. for method in self.history.keys()
  297. for result in self.history[method]
  298. if result.status == 'failed')
  299. @property
  300. def is_static(self) -> bool:
  301. from archivebox.misc.util import is_static_file
  302. return is_static_file(self.url)
  303. @property
  304. def is_archived(self) -> bool:
  305. from archivebox.misc.util import domain
  306. output_paths = (
  307. domain(self.url),
  308. 'output.html',
  309. 'output.pdf',
  310. 'screenshot.png',
  311. 'singlefile.html',
  312. 'readability/content.html',
  313. 'mercury/content.html',
  314. 'htmltotext.txt',
  315. 'media',
  316. 'git',
  317. )
  318. return any(
  319. (ARCHIVE_DIR / self.timestamp / path).exists()
  320. for path in output_paths
  321. )
  322. def latest_outputs(self, status: str=None) -> Dict[str, ArchiveOutput]:
  323. """get the latest output that each archive method produced for link"""
  324. ARCHIVE_METHODS = (
  325. 'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
  326. 'screenshot', 'dom', 'git', 'media', 'archive_org',
  327. )
  328. latest: Dict[str, ArchiveOutput] = {}
  329. for archive_method in ARCHIVE_METHODS:
  330. # get most recent succesful result in history for each archive method
  331. history = self.history.get(archive_method) or []
  332. history = list(filter(lambda result: result.output, reversed(history)))
  333. if status is not None:
  334. history = list(filter(lambda result: result.status == status, history))
  335. history = list(history)
  336. if history:
  337. latest[archive_method] = history[0].output
  338. else:
  339. latest[archive_method] = None
  340. return latest
  341. def canonical_outputs(self) -> Dict[str, Optional[str]]:
  342. """predict the expected output paths that should be present after archiving"""
  343. from ..extractors.wget import wget_output_path
  344. # TODO: banish this awful duplication from the codebase and import these
  345. # from their respective extractor files
  346. canonical = {
  347. 'index_path': 'index.html',
  348. 'favicon_path': 'favicon.ico',
  349. 'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.domain),
  350. 'wget_path': wget_output_path(self),
  351. 'warc_path': 'warc/',
  352. 'singlefile_path': 'singlefile.html',
  353. 'readability_path': 'readability/content.html',
  354. 'mercury_path': 'mercury/content.html',
  355. 'htmltotext_path': 'htmltotext.txt',
  356. 'pdf_path': 'output.pdf',
  357. 'screenshot_path': 'screenshot.png',
  358. 'dom_path': 'output.html',
  359. 'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
  360. 'git_path': 'git/',
  361. 'media_path': 'media/',
  362. 'headers_path': 'headers.json',
  363. }
  364. if self.is_static:
  365. # static binary files like PDF and images are handled slightly differently.
  366. # they're just downloaded once and aren't archived separately multiple times,
  367. # so the wget, screenshot, & pdf urls should all point to the same file
  368. static_path = wget_output_path(self)
  369. canonical.update({
  370. 'title': self.basename,
  371. 'wget_path': static_path,
  372. 'pdf_path': static_path,
  373. 'screenshot_path': static_path,
  374. 'dom_path': static_path,
  375. 'singlefile_path': static_path,
  376. 'readability_path': static_path,
  377. 'mercury_path': static_path,
  378. 'htmltotext_path': static_path,
  379. })
  380. return canonical