archive_org.py

__package__ = 'archivebox.extractors'

from pathlib import Path
from typing import Optional, List, Dict, Tuple
from collections import defaultdict

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
    enforce_types,
    is_static_file,
    dedupe,
)
from ..config import (
    TIMEOUT,
    CURL_ARGS,
    CURL_EXTRA_ARGS,
    CHECK_SSL_VALIDITY,
    SAVE_ARCHIVE_DOT_ORG,
    CURL_BINARY,
    CURL_VERSION,
    CURL_USER_AGENT,
)
from ..logging_util import TimedProgress
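
# This extractor submits the page to the Internet Archive's "Save Page Now"
# service by making a curl HEAD request to https://web.archive.org/save/<url>,
# then records the snapshot URL returned in the response headers in an
# archive.org.txt file inside the link's archive directory.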

@enforce_types
def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    if is_static_file(link.url):
        return False

    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and (out_dir / 'archive.org.txt').exists():
        # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
        return False

    return SAVE_ARCHIVE_DOT_ORG
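
# For example, a direct download link to a static file (such as a PDF or an
# image) makes is_static_file(link.url) return True, so it is skipped entirely,
# and an existing archive.org.txt short-circuits the check unless overwrite=True.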


@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit site to archive.org for archiving via their service, save returned archive url"""

    out_dir = out_dir or Path(link.link_dir)
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
    # later options take precedence
    options = [
        *CURL_ARGS,
        *CURL_EXTRA_ARGS,
        '--head',
        '--max-time', str(timeout),
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
    ]
    cmd = [
        CURL_BINARY,
        *dedupe(options),
        submit_url,
    ]
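
    # Illustrative sketch only (the exact flags depend on the user's CURL_ARGS /
    # CURL_EXTRA_ARGS config); the assembled command looks roughly like:
    #   curl --silent --location --head --max-time 60 \
    #        --user-agent '...' 'https://web.archive.org/save/https://example.com'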
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)
        if content_location:
            archive_org_url = content_location[0]
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
            archive_org_url = None
            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
        elif errors:
            raise ArchiveError(', '.join(errors))
        else:
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()
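
    # At this point output is either the filename 'archive.org.txt' (success) or
    # the caught exception (failure); only the success path writes a file below.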
    if output and not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url, write the
        # submit url so the file can be used to resubmit it later. That way, when
        # the user opens the saved URL themselves, archive.org will attempt to
        # re-archive the page and show a nicer error explaining why it was rejected.
        archive_org_url = archive_org_url or submit_url
        with open(str(out_dir / output), 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=str(out_dir))
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
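
# Sketch of a typical call site (simplified; within ArchiveBox these functions
# are invoked by the extractor pipeline rather than called directly):
#   if should_save_archive_dot_org(link, out_dir=out_dir):
#       result = save_archive_dot_org(link, out_dir=out_dir, timeout=TIMEOUT)
#       print(result.status, result.output)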


@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    # Parse archive.org response headers
    headers: Dict[str, List[str]] = defaultdict(list)

    # lowercase all the header names and store in dict
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers.get('content-location', headers['location'])
    errors = headers['x-archive-wayback-runtime-error']

    return content_location, errors
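
# Example with an illustrative (not live) response: a successful Save Page Now
# reply carries the new snapshot path in its content-location header:
#   parse_archive_dot_org_response(
#       b'HTTP/2 302\r\ncontent-location: /web/20240101000000/https://example.com\r\n'
#   )
#   == (['/web/20240101000000/https://example.com'], [])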