archive_methods.py

import os

from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime

from .schema import Link, ArchiveResult, ArchiveOutput
from .index import (
    load_link_details,
    write_link_details,
    patch_main_index,
)
from .config import (
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    SAVE_FAVICON,
    SAVE_TITLE,
    SAVE_WGET,
    SAVE_WGET_REQUISITES,
    SAVE_PDF,
    SAVE_SCREENSHOT,
    SAVE_DOM,
    SAVE_WARC,
    SAVE_GIT,
    SAVE_MEDIA,
    SAVE_ARCHIVE_DOT_ORG,
    TIMEOUT,
    MEDIA_TIMEOUT,
    GIT_DOMAINS,
    VERSION,
    WGET_USER_AGENT,
    CHECK_SSL_VALIDITY,
    COOKIES_FILE,
    CURL_VERSION,
    WGET_VERSION,
    CHROME_VERSION,
    GIT_VERSION,
    YOUTUBEDL_VERSION,
    WGET_AUTO_COMPRESSION,
)
from .util import (
    enforce_types,
    domain,
    extension,
    without_query,
    without_fragment,
    fetch_page_title,
    is_static_file,
    TimedProgress,
    chmod_file,
    wget_output_path,
    chrome_args,
    run, PIPE, DEVNULL,
)
from .logs import (
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)


class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints

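# Each entry in ARCHIVE_METHODS (defined inside archive_link below) pairs a
# method name with a should_save_* predicate (decides whether to run, e.g.
# because the output already exists or the method is disabled in config) and a
# save_* function that does the work and returns an ArchiveResult. Results are
# appended to link.history under the method's name, so repeated runs build up
# a per-method record of successes and failures.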
@enforce_types
def archive_link(link: Link, out_dir: Optional[str]=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    )

    out_dir = out_dir or link.link_dir
    try:
        is_new = not os.path.exists(out_dir)
        if is_new:
            os.makedirs(out_dir)

        link = load_link_details(link, out_dir=out_dir)
        log_link_archiving_started(link, out_dir, is_new)
        link = link.overwrite(updated=datetime.now())
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, out_dir):
                    log_archive_method_started(method_name)

                    result = method_function(link=link, out_dir=out_dir)

                    link.history[method_name].append(result)
                    stats[result.status] += 1
                    log_archive_method_finished(result)
                else:
                    stats['skipped'] += 1
            except Exception as e:
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e

        # print(' ', stats)

        write_link_details(link, out_dir=link.link_dir)
        patch_main_index(link)

        # # If any changes were made, update the main links index json and html
        # was_changed = stats['succeeded'] or stats['failed']
        # if was_changed:
        #     patch_main_index(link)

        log_link_archiving_finished(link, link.link_dir, is_new, stats)

    except KeyboardInterrupt:
        try:
            write_link_details(link, out_dir=link.link_dir)
        except:
            pass
        raise

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link

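# Usage sketch (hedged -- Links are normally produced by the parser/index
# modules rather than constructed by hand, and the exact field set lives in
# .schema.Link; links_to_archive here is just a hypothetical iterable):
#
#     for link in links_to_archive:
#         archive_link(link)    # runs every enabled save_* method in order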
### Archive Method Functions

@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

    return SAVE_TITLE

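# Note: the cmd recorded below is only stored on the ArchiveResult for
# provenance/display; the actual title extraction happens in-process via
# fetch_page_title() rather than by shelling out to curl | grep.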
@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title',
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        output = fetch_page_title(link.url, timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
        return False

    return SAVE_FAVICON


@enforce_types
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', str(output),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
    output_path = wget_output_path(link)
    out_dir = out_dir or link.link_dir
    if output_path and os.path.exists(os.path.join(out_dir, output_path)):
        return False

    return SAVE_WGET

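# save_wget mirrors the page (and optionally its requisites and a WARC capture)
# into a domain-named folder tree under out_dir. Because wget's exit code alone
# can't distinguish "page saved but one requisite failed" from a real failure,
# the "Downloaded: N files" summary is parsed out of the last lines of output,
# and a nonzero exit is only treated as an error when nothing was downloaded.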
@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    out_dir = out_dir or link.link_dir
    if SAVE_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=windows',
        '--timeout={}'.format(timeout),
        *([] if SAVE_WARC else ['--timestamping']),
        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #   "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

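# The next three methods (pdf, screenshot, dom) all shell out to headless
# Chrome/Chromium with the shared flag set from chrome_args(); they differ only
# in the output flag (--print-to-pdf, --screenshot, --dump-dom) and in whether
# Chrome writes the file itself or the result is captured from stdout.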
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
        return False

    return SAVE_PDF


@enforce_types
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print PDF of site to file using chrome --headless"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.pdf'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--print-to-pdf',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to save PDF', hints)

        chmod_file('output.pdf', cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
        return False

    return SAVE_SCREENSHOT


@enforce_types
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to save screenshot', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'output.html')):
        return False

    return SAVE_DOM

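# Unlike save_pdf/save_screenshot (where Chrome writes the file itself), the
# DOM dump is emitted on Chrome's stdout, so it is redirected into output.html
# via the open file handle passed as stdout= below.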
@enforce_types
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-dom"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.html'
    output_path = os.path.join(out_dir, str(output))
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to save DOM', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'git')):
        return False

    is_clonable_url = (
        (domain(link.url) in GIT_DOMAINS)
        or (extension(link.url) == 'git')
    )
    if not is_clonable_url:
        return False

    return SAVE_GIT

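# git clone --mirror grabs all refs (branches, tags, etc.) into a bare repo
# under the git/ subfolder. A return code of 128 (e.g. destination path already
# exists) is treated as a failed re-download of an existing clone and ignored,
# so re-running the archiver over an existing folder doesn't count as a failure.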
@enforce_types
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)

        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed to save git clone', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )


@enforce_types
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'media')):
        return False

    return SAVE_MEDIA

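# save_media hands the URL to youtube-dl, which downloads any playlist or
# individual video/audio it can extract, plus subtitles, thumbnails, and
# metadata, into the media/ subfolder. Common "this isn't a media page" errors
# (Unsupported URL, HTTP 404/403, etc.) are swallowed, since most archived
# pages aren't media pages and printing them would just be noise.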
@enforce_types
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'media'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=out_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to save media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
    out_dir = out_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
        # if open(path, 'r').read().strip() != 'None':
        return False

    return SAVE_ARCHIVE_DOT_ORG

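# Submitting to the Wayback Machine works by making a HEAD request to
# https://web.archive.org/save/<url>; on success, archive.org responds with a
# Content-Location header pointing at the new snapshot path, which is written
# to archive.org.txt. Robots.txt refusals are treated as a non-error, and the
# submit URL is saved instead so the snapshot can be retried by hand later.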
@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time', str(timeout),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if output and not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url, write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=out_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

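# Worked example (a sketch -- the exact header layout archive.org returns may
# vary): given raw response headers like
#
#     b'HTTP/2 302\r\ncontent-location: /web/20190401000000/https://example.com\r\n\r\n'
#
# parse_archive_dot_org_response() returns
#
#     (['/web/20190401000000/https://example.com'], [])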
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']

    return content_location, errors