archive_methods.py

import os

from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from datetime import datetime

from .schema import Link, ArchiveResult, ArchiveOutput
from .index import (
    write_link_index,
    patch_links_index,
    load_json_link_index,
)
from .config import (
    CURL_BINARY,
    GIT_BINARY,
    WGET_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_FAVICON,
    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_WARC,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
    TIMEOUT,
    MEDIA_TIMEOUT,
    GIT_DOMAINS,
    VERSION,
    WGET_USER_AGENT,
    CHECK_SSL_VALIDITY,
    COOKIES_FILE,
    CURL_VERSION,
    WGET_VERSION,
    CHROME_VERSION,
    GIT_VERSION,
    YOUTUBEDL_VERSION,
    WGET_AUTO_COMPRESSION,
)
from .util import (
    enforce_types,
    domain,
    extension,
    without_query,
    without_fragment,
    fetch_page_title,
    is_static_file,
    TimedProgress,
    chmod_file,
    wget_output_path,
    chrome_args,
    run, PIPE, DEVNULL,
)
from .logs import (
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)

class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints

@enforce_types
def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    link_dir = link_dir or link.link_dir
    try:
        is_new = not os.path.exists(link_dir)
        if is_new:
            os.makedirs(link_dir)

        link = load_json_link_index(link, link_dir=link_dir)
        log_link_archiving_started(link, link_dir, is_new)
        link = link.overwrite(updated=datetime.now())
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, link_dir):
                    log_archive_method_started(method_name)
                    result = method_function(link=link, link_dir=link_dir)
                    link.history[method_name].append(result)
                    stats[result.status] += 1
                    log_archive_method_finished(result)
                else:
                    stats['skipped'] += 1
            except Exception as e:
                raise Exception('Exception in archive_methods.fetch_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e

        # print(' ', stats)

        # If any changes were made, update the link index json and html
        write_link_index(link, link_dir=link.link_dir)

        was_changed = stats['succeeded'] or stats['failed']
        if was_changed:
            patch_links_index(link)

        log_link_archiving_finished(link, link.link_dir, is_new, stats)

    except KeyboardInterrupt:
        try:
            write_link_index(link, link_dir=link.link_dir)
        except:
            pass
        raise

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link

### Archive Method Functions

@enforce_types
def should_fetch_title(link: Link, link_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

    return FETCH_TITLE

@enforce_types
def fetch_title(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
    # this cmd is only recorded in the ArchiveResult for the index;
    # the actual request is made by fetch_page_title() in the try block below
    cmd = [
        CURL_BINARY,
        link.url,
        '|',
        'grep',
        '<title',
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        output = fetch_page_title(link.url, timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_favicon(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
        return False

    return FETCH_FAVICON

@enforce_types
def fetch_favicon(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download site favicon from google's favicon api"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'
    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', str(output),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_wget(link: Link, link_dir: Optional[str]=None) -> bool:
    output_path = wget_output_path(link)
    link_dir = link_dir or link.link_dir
    if output_path and os.path.exists(os.path.join(link_dir, output_path)):
        return False

    return FETCH_WGET

@enforce_types
def fetch_wget(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget"""

    link_dir = link_dir or link.link_dir
    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--restrict-file-names=unix',
        '--timeout={}'.format(timeout),
        *([] if FETCH_WARC else ['--timestamping']),
        *(['--warc-file={}'.format(warc_path)] if FETCH_WARC else []),
        *(['--page-requisites'] if FETCH_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        files_downloaded = (
            int(output_tail[-1].strip().split(' ', 2)[1] or 0)
            if 'Downloaded:' in output_tail[-1]
            else 0
        )

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'output.pdf')):
        return False

    return FETCH_PDF

@enforce_types
def fetch_pdf(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print PDF of site to file using chrome --headless"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'output.pdf'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--print-to-pdf',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to print PDF', hints)

        chmod_file('output.pdf', cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_screenshot(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return False

    return FETCH_SCREENSHOT

@enforce_types
def fetch_screenshot(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """take screenshot of site using chrome --headless"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'screenshot.png'
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--screenshot',
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to take screenshot', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_dom(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'output.html')):
        return False

    return FETCH_DOM

@enforce_types
def fetch_dom(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print HTML of site to file using chrome --dump-dom"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'output.html'
    output_path = os.path.join(link_dir, str(output))
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link.url
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to fetch DOM', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'git')):
        return False

    is_clonable_url = (
        (domain(link.url) in GIT_DOMAINS)
        or (extension(link.url) == 'git')
    )
    if not is_clonable_url:
        return False

    return FETCH_GIT

@enforce_types
def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(link_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        without_query(without_fragment(link.url)),
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        if result.returncode == 128:
            # ignore failed re-download when the folder already exists
            pass
        elif result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed git download', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_media(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'media')):
        return False

    return FETCH_MEDIA

@enforce_types
def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'media'
    output_path = os.path.join(link_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--no-check-certificate',
        '--user-agent',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
        link.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bool:
    link_dir = link_dir or link.link_dir
    if is_static_file(link.url):
        return False

    if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
        # if open(path, 'r').read().strip() != 'None':
        return False

    return SUBMIT_ARCHIVE_DOT_ORG

@enforce_types
def archive_dot_org(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    link_dir = link_dir or link.link_dir
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
        '--max-time', str(timeout),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if output and not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, str(output)), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=link_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )

@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']

    return content_location, errors
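
# ----------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): assuming this
# file lives next to schema.py in the same package and that Link can be
# constructed from just a url and timestamp, archiving a single URL from other
# package code would look roughly like this:
#
#     from .schema import Link
#     from .archive_methods import archive_link
#
#     link = Link(url='https://example.com', timestamp='1554984695.0')
#     link = archive_link(link)  # runs each enabled ARCHIVE_METHODS entry and
#                                # writes its outputs into link.link_dir
# ----------------------------------------------------------------------------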