
# archive_methods.py

import os

from typing import Dict, List, Tuple
from collections import defaultdict
from datetime import datetime

from schema import Link, ArchiveResult, ArchiveError
from index import (
    write_link_index,
    patch_links_index,
    load_json_link_index,
)
from config import (
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    TIMEOUT,
    MEDIA_TIMEOUT,
    ANSI,
    OUTPUT_DIR,
    GIT_DOMAINS,
    GIT_SHA,
    WGET_USER_AGENT,
    CHECK_SSL_VALIDITY,
    COOKIES_FILE,
    CURL_VERSION,
    WGET_VERSION,
    CHROME_VERSION,
    GIT_VERSION,
    YOUTUBEDL_VERSION,
)
from util import (
    domain,
    extension,
    without_query,
    without_fragment,
    fetch_page_title,
    is_static_file,
    TimedProgress,
    chmod_file,
    wget_output_path,
    chrome_args,
    check_link_structure,
    run, PIPE, DEVNULL,
    Link,
)
from logs import (
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)
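
# Each archive method below is a (name, should_fetch, fetch) pair: should_fetch()
# decides whether the method needs to run for a given link (based on its config flag
# and whether its output already exists in link_dir), and fetch() runs the actual
# command and returns an ArchiveResult that gets appended to link['history'][name].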
def archive_link(link_dir: str, link: Link, page=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    try:
        is_new = not os.path.exists(link_dir)
        if is_new:
            os.makedirs(link_dir)

        link = load_json_link_index(link_dir, link)
        log_link_archiving_started(link_dir, link, is_new)
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            if method_name not in link['history']:
                link['history'][method_name] = []

            if should_run(link_dir, link):
                log_archive_method_started(method_name)
                result = method_function(link_dir, link)
                link['history'][method_name].append(result._asdict())
                stats[result.status] += 1
                log_archive_method_finished(result)
            else:
                stats['skipped'] += 1

        # print(' ', stats)

        write_link_index(link_dir, link)
        patch_links_index(link)
        log_link_archiving_finished(link_dir, link, is_new, stats)
    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link


### Archive Method Functions
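
# Title: the actual fetching is done in pure Python by util.fetch_page_title();
# the curl|grep cmd list below is only recorded in the ArchiveResult, it is not
# the command that actually runs.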
def should_fetch_title(link_dir: str, link: Link) -> bool:
    # if link already has valid title, skip it
    if link['title'] and not link['title'].lower().startswith('http'):
        return False

    if is_static_file(link['url']):
        return False

    return FETCH_TITLE


def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output = None
    cmd = [
        CURL_BINARY,
        link['url'],
        '|',
        'grep',
        '<title>',
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        output = fetch_page_title(link['url'], timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
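
# Favicon: saves favicon.ico by curl-ing Google's s2/favicons endpoint for the link's domain.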
def should_fetch_favicon(link_dir: str, link: Link) -> bool:
    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
        return False

    return FETCH_FAVICON


def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    output = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
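
# Wget: mirrors the page (and optionally its requisites and a WARC) into link_dir.
# wget's exit code alone isn't reliable, so the tail of its output is parsed for the
# "Downloaded: N files, ..." summary to distinguish partial successes from hard failures.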
def should_fetch_wget(link_dir: str, link: Link) -> bool:
    output_path = wget_output_path(link)
    if output_path and os.path.exists(os.path.join(link_dir, output_path)):
        return False

    return FETCH_WGET


def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '--compression=auto',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *(() if FETCH_WARC else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts')),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
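
# PDF: prints the rendered page to output.pdf using headless Chrome
# (the base flags come from util.chrome_args()).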
def should_fetch_pdf(link_dir: str, link: Link) -> bool:
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return False

    return FETCH_PDF


def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """print PDF of site to file using chrome --headless"""

    output = 'output.pdf'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--print-to-pdf',
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to print PDF', hints)

        chmod_file('output.pdf', cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
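
# Screenshot: captures screenshot.png of the rendered page using headless Chrome.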
def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return False

    return FETCH_SCREENSHOT


def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""

    output = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
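
# DOM: dumps the rendered DOM to output.html by redirecting Chrome's --dump-dom
# stdout straight into the output file.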
def should_fetch_dom(link_dir: str, link: Link) -> bool:
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'output.html')):
        return False

    return FETCH_DOM


def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-html"""

    output = 'output.html'
    output_path = os.path.join(link_dir, output)
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link['url']
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to fetch DOM', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
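
# Git: only runs for URLs on a known git hosting domain (GIT_DOMAINS) or ending in .git,
# and clones a bare mirror of the repo into link_dir/git/.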
def should_fetch_git(link_dir: str, link: Link) -> bool:
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'git')):
        return False

    is_clonable_url = (
        (domain(link['url']) in GIT_DOMAINS)
        or (extension(link['url']) == 'git')
    )
    if not is_clonable_url:
        return False

    return FETCH_GIT


def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    output = 'git'
    output_path = os.path.join(link_dir, 'git')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
        without_query(without_fragment(link['url'])),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed git download', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
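
# Media: uses youtube-dl to grab video/audio, subtitles, thumbnails, and metadata into
# link_dir/media/. "Unsupported URL"-style errors are silently ignored because most
# archived pages simply aren't media pages.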
def should_fetch_media(link_dir: str, link: Link) -> bool:
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'media')):
        return False

    return FETCH_MEDIA


def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    output = 'media'
    output_path = os.path.join(link_dir, 'media')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
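
# Archive.org: submits the URL to the Wayback Machine's /save/ endpoint and records the
# resulting snapshot URL (or the submit URL itself, if the save was rejected) in archive.org.txt.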
def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
    if is_static_file(link['url']):
        return False

    if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
        # if open(path, 'r').read().strip() != 'None':
        return False

    return SUBMIT_ARCHIVE_DOT_ORG


def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    output = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time', str(timeout),
        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
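
# Helper for archive_dot_org(): splits curl's --head output into a {header: [values]} dict
# and pulls out the "content-location" and "x-archive-wayback-runtime-error" headers.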
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
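
# Illustrative example (the header values below are made up, not from a real archive.org
# response): given headers like
#     b'HTTP/1.1 200 OK\r\nContent-Location: /web/20190101000000/https://example.com\r\n'
# parse_archive_dot_org_response() returns
#     (['/web/20190101000000/https://example.com'], [])
# which archive_dot_org() turns into the final snapshot URL
#     https://web.archive.org/web/20190101000000/https://example.com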