archive_methods.py

import os

from typing import Dict, List, Tuple
from collections import defaultdict
from datetime import datetime

from schema import Link, ArchiveResult, ArchiveError
from index import (
    write_link_index,
    patch_links_index,
    load_json_link_index,
)
from config import (
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    TIMEOUT,
    MEDIA_TIMEOUT,
    ANSI,
    OUTPUT_DIR,
    GIT_DOMAINS,
    GIT_SHA,
    WGET_USER_AGENT,
    CHECK_SSL_VALIDITY,
    COOKIES_FILE,
    CURL_VERSION,
    WGET_VERSION,
    CHROME_VERSION,
    GIT_VERSION,
    YOUTUBEDL_VERSION,
)
from util import (
    enforce_types,
    domain,
    extension,
    without_query,
    without_fragment,
    fetch_page_title,
    is_static_file,
    TimedProgress,
    chmod_file,
    wget_output_path,
    chrome_args,
    run, PIPE, DEVNULL,
    Link,
)
from logs import (
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)

@enforce_types
def archive_link(link: Link, page=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    try:
        is_new = not os.path.exists(link.link_dir)
        if is_new:
            os.makedirs(link.link_dir)

        link = load_json_link_index(link.link_dir, link)
        log_link_archiving_started(link.link_dir, link, is_new)
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            if method_name not in link.history:
                link.history[method_name] = []

            if should_run(link.link_dir, link):
                log_archive_method_started(method_name)
                result = method_function(link.link_dir, link)
                link.history[method_name].append(result)
                stats[result.status] += 1
                log_archive_method_finished(result)
            else:
                stats['skipped'] += 1

        # print('    ', stats)

        link = Link(**{
            **link._asdict(),
            'updated': datetime.now(),
        })
        write_link_index(link.link_dir, link)
        patch_links_index(link)
        log_link_archiving_finished(link.link_dir, link, is_new, stats)

    except KeyboardInterrupt:
        raise

    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link
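
# A minimal usage sketch (hypothetical values; the remaining Link fields are
# defined in schema.py and omitted here). archive_link() creates the snapshot
# folder if needed, runs every enabled method above, and rewrites the index:
#
#   link = Link(url='https://example.com', timestamp='1554233423', ...)
#   link = archive_link(link)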

### Archive Method Functions

@enforce_types
def should_fetch_title(link_dir: str, link: Link) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

    return FETCH_TITLE

@enforce_types
def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output = None
    # this cmd is only recorded for display purposes; the actual fetch is done
    # in-process by util.fetch_page_title() below
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title>',
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        output = fetch_page_title(link.url, timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_favicon(link_dir: str, link: Link) -> bool:
    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
        return False

    return FETCH_FAVICON

@enforce_types
def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    output = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_wget(link_dir: str, link: Link) -> bool:
    output_path = wget_output_path(link)
    if output_path and os.path.exists(os.path.join(link_dir, output_path)):
        return False

    return FETCH_WGET

@enforce_types
def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '--compression=auto',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if FETCH_WARC else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
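
# Sketch of the stderr-tail parse in fetch_wget() above: wget's final summary line,
#   Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)
# splits via .split(' ', 2) into ['Downloaded:', '76', 'files, 4.0M in 1.6s (2.52 MB/s)'],
# so element [1] is the downloaded-file count used to tell partial successes
# apart from outright failures.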

@enforce_types
def should_fetch_pdf(link_dir: str, link: Link) -> bool:
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return False

    return FETCH_PDF

@enforce_types
def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """print PDF of site to file using chrome --headless"""

    output = 'output.pdf'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--print-to-pdf',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to print PDF', hints)

        chmod_file('output.pdf', cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return False

    return FETCH_SCREENSHOT

@enforce_types
def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""

    output = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_dom(link_dir: str, link: Link) -> bool:
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'output.html')):
        return False

    return FETCH_DOM

@enforce_types
def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-dom"""

    output = 'output.html'
    output_path = os.path.join(link_dir, output)
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
            if result.returncode:
                hints = result.stderr.decode()
                raise ArchiveError('Failed to fetch DOM', hints)

            chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_git(link_dir: str, link: Link) -> bool:
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'git')):
        return False

    is_clonable_url = (
        (domain(link.url) in GIT_DOMAINS)
        or (extension(link.url) == 'git')
    )
    if not is_clonable_url:
        return False

    return FETCH_GIT
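
# Example of the is_clonable_url check above (assuming GIT_DOMAINS keeps its
# usual default of 'github.com,bitbucket.org,gitlab.com'):
# https://github.com/pirate/ArchiveBox and https://example.com/some/repo.git
# would both be treated as clonable, while an ordinary article URL would not.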

@enforce_types
def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    output = 'git'
    output_path = os.path.join(link_dir, 'git')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed git download', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_media(link_dir: str, link: Link) -> bool:
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'media')):
        return False

    return FETCH_MEDIA

@enforce_types
def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    output = 'media'
    output_path = os.path.join(link_dir, 'media')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        # cert checking is toggled in one place, via CHECK_SSL_VALIDITY
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
        # if open(path, 'r').read().strip() != 'None':
        return False

    return SUBMIT_ARCHIVE_DOT_ORG

@enforce_types
def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    output = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
        '--max-time', str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']

    return content_location, errors
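
# A sketch of the expected behaviour (header values here are illustrative, not
# a real Archive.org response):
#
#   >>> parse_archive_dot_org_response(
#   ...     b'HTTP/2 200\r\n'
#   ...     b'Content-Location: /web/20190401000000/https://example.com\r\n'
#   ... )
#   (['/web/20190401000000/https://example.com'], [])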