util.py

import os
import re
import sys
import time
import json

from urllib.request import Request, urlopen
from urllib.parse import urlparse
from decimal import Decimal
from urllib.parse import quote
from datetime import datetime
from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
from multiprocessing import Process

from config import (
    ANSI,
    IS_TTY,
    TERM_WIDTH,
    REPO_DIR,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    OUTPUT_PERMISSIONS,
    TIMEOUT,
    SHOW_PROGRESS,
    CHECK_SSL_VALIDITY,
    WGET_USER_AGENT,
    CURL_BINARY,
    WGET_BINARY,
    CHROME_BINARY,
    GIT_BINARY,
    YOUTUBEDL_BINARY,
    FETCH_TITLE,
    FETCH_FAVICON,
    FETCH_WGET,
    FETCH_WARC,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_DOM,
    FETCH_GIT,
    FETCH_MEDIA,
    SUBMIT_ARCHIVE_DOT_ORG,
)

# URL helpers: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

short_ts = lambda ts: ts.split('.')[0]

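# Illustrative usage (not part of the original module), e.g. for
# url = 'https://example.com/some/path/page.html?q=1#frag':
#   scheme(url)    -> 'https'
#   domain(url)    -> 'example.com'
#   basename(url)  -> 'page.html'
#   extension(url) -> 'html'
#   base_url(url)  -> 'example.com/some/path/page.html?q=1#frag'
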
URL_REGEX = re.compile(
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
    r'[^\]\[\(\)<>\""\'\s]+',         # stop parsing at these symbols
    re.IGNORECASE,
)

HTML_TITLE_REGEX = re.compile(
    r'<title>'      # start matching text after <title> tag
    r'(.[^<>]+)',   # get everything up to these symbols
    re.IGNORECASE,
)

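# Illustrative example (not part of the original module):
#   re.search(HTML_TITLE_REGEX, '<head><title>Some Page</title></head>').group(1)  -> 'Some Page'
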
def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions"""

    python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
    if sys.version_info < (3, 5):
        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
        print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
        raise SystemExit(1)

    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_WGET or FETCH_WARC:
        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
        if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
        try:
            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
            version_str = result.stdout.decode('utf-8')
            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
            version = [l for l in version_lines if l.isdigit()][-1]
            if int(version) < 59:
                print(version_lines)
                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                print(' See https://github.com/pirate/ArchiveBox for help.')
                raise SystemExit(1)
        except (IndexError, TypeError, OSError):
            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_GIT:
        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

    if FETCH_MEDIA:
        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
            print(' See https://github.com/pirate/ArchiveBox for help.')
            raise SystemExit(1)

def check_url_parsing():
    """Check that plain text regex URL parsing works as expected"""

    test_urls = '''
    https://example1.com/what/is/happening.html?what=1#how-about-this=1
    https://example2.com/what/is/happening/?what=1#how-about-this=1
    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
    https://example4.com/what/is/happening.html
    https://example5.com/
    https://example6.com
    <test>http://example7.com</test>
    [https://example8.com/what/is/this.php?what=1]
    [and http://example9.com?what=1&other=3#and-thing=2]
    <what>https://example10.com#and-thing=2 "</about>
    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
    example13.bada
    and example14.badb
    <or>htt://example15.badc</that>
    '''
    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
    assert len(re.findall(URL_REGEX, test_urls)) == 12

def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
    """chmod -R <permissions> <cwd>/<path>"""

    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode == 1:
        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))

def progress(seconds=TIMEOUT, prefix=''):
    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
    returns end() function to instantly finish the progress
    """

    if not SHOW_PROGRESS:
        return lambda: None

    def progress_bar(seconds, prefix):
        """show timer in the form of progress bar, with percentage and seconds remaining"""
        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
        try:
            for s in range(seconds * chunks):
                progress = s / chunks / seconds * 100
                bar_width = round(progress/(100/chunks))

                # ████████████████████ 0.9% (1/60sec)
                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                    prefix,
                    ANSI['green'],
                    (chunk * bar_width).ljust(chunks),
                    ANSI['reset'],
                    round(progress, 1),
                    round(s/chunks),
                    seconds,
                ))
                sys.stdout.flush()
                time.sleep(1 / chunks)

            # ██████████████████████████████████ 100.0% (60/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
                prefix,
                ANSI['red'],
                chunk * chunks,
                ANSI['reset'],
                100.0,
                seconds,
                seconds,
            ))
            sys.stdout.flush()
        except KeyboardInterrupt:
            print()
            pass

    p = Process(target=progress_bar, args=(seconds, prefix))
    p.start()

    def end():
        """immediately finish progress and clear the progressbar line"""
        # protect from double termination
        #if p is None or not hasattr(p, 'kill'):
        #    return
        nonlocal p
        if p is not None:
            p.terminate()
            p = None

        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
        sys.stdout.flush()

    return end

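# Illustrative usage (not part of the original module): progress() starts the bar in a
# background process and returns a callback that stops it and clears the line, e.g.:
#   end = progress(TIMEOUT, prefix=' ')
#   try:
#       do_some_slow_work()   # hypothetical placeholder for the real work
#       end()
#   except Exception:
#       end()
#       raise
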
def pretty_path(path):
    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
    return path.replace(REPO_DIR + '/', '')

def save_stdin_source(raw_text):
    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(raw_text)

    return source_path

def fetch_page_content(url, timeout=TIMEOUT):
    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})

    if CHECK_SSL_VALIDITY:
        resp = urlopen(req, timeout=timeout)
    else:
        import ssl
        insecure = ssl._create_unverified_context()
        resp = urlopen(req, timeout=timeout, context=insecure)

    encoding = resp.headers.get_content_charset() or 'utf-8'
    return resp.read().decode(encoding)

def save_remote_source(url, timeout=TIMEOUT):
    """download a given url's content into downloads/domain.txt"""

    if not os.path.exists(SOURCES_DIR):
        os.makedirs(SOURCES_DIR)

    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))

    print('{}[*] [{}] Downloading {}{}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        url,
        ANSI['reset'],
    ))
    end = progress(TIMEOUT, prefix=' ')
    try:
        downloaded_xml = fetch_page_content(url, timeout=timeout)
        end()
    except Exception as e:
        end()
        print('{}[!] Failed to download {}{}\n'.format(
            ANSI['red'],
            url,
            ANSI['reset'],
        ))
        print(' ', e)
        raise SystemExit(1)

    with open(source_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)

    print(' > {}'.format(pretty_path(source_path)))

    return source_path

def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
    """Attempt to guess a page's title by downloading the html"""
    if not FETCH_TITLE:
        return None

    try:
        if progress:
            sys.stdout.write('.')
            sys.stdout.flush()

        html = fetch_page_content(url, timeout=timeout)
        match = re.search(HTML_TITLE_REGEX, html)
        return match.group(1).strip() if match else None
    except Exception as err:
        # print('[!] Failed to fetch title because of {}: {}'.format(
        #     err.__class__.__name__,
        #     err,
        # ))
        return None

def str_between(string, start, end=None):
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""

    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content

def get_link_type(link):
    """Certain types of links need to be handled specially, this figures out when that's the case"""

    if extension(link['url']) == 'pdf':
        return 'PDF'
    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
        return 'image'
    elif 'wikipedia.org' in domain(link['url']).lower():
        return 'wiki'
    elif 'youtube.com' in domain(link['url']).lower():
        return 'youtube'
    elif 'soundcloud.com' in domain(link['url']).lower():
        return 'soundcloud'
    elif 'youku.com' in domain(link['url']).lower():
        return 'youku'
    elif 'vimeo.com' in domain(link['url']).lower():
        return 'vimeo'
    return None

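# Illustrative examples (not part of the original module):
#   get_link_type({'url': 'https://example.com/paper.pdf'})       -> 'PDF'
#   get_link_type({'url': 'https://en.wikipedia.org/wiki/Curl'})  -> 'wiki'
#   get_link_type({'url': 'https://example.com/article.html'})    -> None
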
def merge_links(a, b):
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
    earlier = lambda key: a[key] if a[key] < b[key] else b[key]

    url = longer('url')
    longest_title = longer('title')
    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
    link = {
        'timestamp': earlier('timestamp'),
        'url': url,
        'domain': domain(url),
        'base_url': base_url(url),
        'tags': longer('tags'),
        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
    }
    link['type'] = get_link_type(link)
    return link

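# Illustrative example (not part of the original module): merging two records for the same
# page keeps the earlier timestamp and prefers the longer title, unless the longer one is
# just a bare URL, e.g.:
#   a = {'timestamp': '1544444444', 'url': 'https://example.com/page', 'title': 'https://example.com/page', 'tags': '', 'sources': ['feed.xml']}
#   b = {'timestamp': '1555555555', 'url': 'https://example.com/page', 'title': 'Example Page', 'tags': '', 'sources': ['bookmarks.html']}
#   merge_links(a, b)['timestamp']  -> '1544444444'
#   merge_links(a, b)['title']      -> 'Example Page'
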
def find_link(folder, links):
    """for a given archive folder, find the corresponding link object in links"""
    url = parse_url(folder)
    if url:
        for link in links:
            if (base_url(link['url']) in url) or (url in link['url']):
                return link

    timestamp = folder.split('.')[0]
    for link in links:
        if link['timestamp'].startswith(timestamp):
            if domain(link['url']) in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
                return link  # careful now, this isn't safe for most ppl
            if domain(link['url']) in parse_url(folder):
                return link

    return None

def parse_url(folder):
    """for a given archive folder, figure out what url it's for"""
    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
    if os.path.exists(link_json):
        with open(link_json, 'r') as f:
            try:
                link_json = f.read().strip()
                if link_json:
                    link = json.loads(link_json)
                    return base_url(link['url'])
            except ValueError:
                print('File contains invalid JSON: {}!'.format(link_json))

    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
    if os.path.exists(archive_org_txt):
        with open(archive_org_txt, 'r') as f:
            original_link = f.read().strip().split('/http', 1)[-1]
            with_scheme = 'http{}'.format(original_link)
            return with_scheme

    return ''

def manually_merge_folders(source, target):
    """prompt for user input to resolve a conflict between two archive folders"""

    if not IS_TTY:
        return

    fname = lambda path: path.split('/')[-1]

    print(' {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
    print(' - [enter]: do nothing (keep both)')
    print(' - a: prefer files from {}'.format(source))
    print(' - b: prefer files from {}'.format(target))
    print(' - q: quit and resolve the conflict manually')
    try:
        answer = input('> ').strip().lower()
    except KeyboardInterrupt:
        answer = 'q'

    assert answer in ('', 'a', 'b', 'q'), 'Invalid choice.'

    if answer == 'q':
        print('\nJust run ArchiveBox again to pick up where you left off.')
        raise SystemExit(0)
    elif answer == '':
        return

    files_in_source = set(os.listdir(source))
    files_in_target = set(os.listdir(target))
    for file in files_in_source:
        if file in files_in_target:
            to_delete = target if answer == 'a' else source
            run(['rm', '-Rf', os.path.join(to_delete, file)])
        run(['mv', os.path.join(source, file), os.path.join(target, file)])

    if not set(os.listdir(source)):
        run(['rm', '-Rf', source])

def fix_folder_path(archive_path, link_folder, link):
    """given a folder, merge it to the canonical 'correct' path for the given link object"""
    source = os.path.join(archive_path, link_folder)
    target = os.path.join(archive_path, link['timestamp'])

    url_in_folder = parse_url(source)
    if not (url_in_folder in base_url(link['url'])
            or base_url(link['url']) in url_in_folder):
        raise ValueError('The link does not match the url for this folder.')

    if not os.path.exists(target):
        # target doesn't exist so nothing needs merging, simply move A to B
        run(['mv', source, target])
    else:
        # target folder exists, check for conflicting files and attempt manual merge
        files_in_source = set(os.listdir(source))
        files_in_target = set(os.listdir(target))
        conflicting_files = files_in_source & files_in_target

        if not conflicting_files:
            for file in files_in_source:
                run(['mv', os.path.join(source, file), os.path.join(target, file)])

    if os.path.exists(source):
        files_in_source = set(os.listdir(source))
        if files_in_source:
            manually_merge_folders(source, target)
        else:
            run(['rm', '-R', source])

def migrate_data():
    # migrate old folder to new OUTPUT folder
    old_dir = os.path.join(REPO_DIR, 'html')
    if os.path.exists(old_dir):
        print('[!] WARNING: Moved old output folder "html" to new location: {}'.format(OUTPUT_DIR))
        run(['mv', old_dir, OUTPUT_DIR], timeout=10)

def cleanup_archive(archive_path, links):
    """move any incorrectly named folders to their canonical locations"""

    # for each folder that exists, see if we can match it up with a known good link
    # if we can, then merge the two folders (TODO: if not, move it to lost & found)

    unmatched = []
    bad_folders = []

    if not os.path.exists(archive_path):
        return

    for folder in os.listdir(archive_path):
        try:
            files = os.listdir(os.path.join(archive_path, folder))
        except NotADirectoryError:
            continue

        if files:
            link = find_link(folder, links)
            if link is None:
                unmatched.append(folder)
                continue

            if folder != link['timestamp']:
                bad_folders.append((folder, link))
        else:
            # delete empty folders
            run(['rm', '-R', os.path.join(archive_path, folder)])

    if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
        print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
        for folder, link in bad_folders:
            fix_folder_path(archive_path, folder, link)
    elif bad_folders:
        print('[!] Warning! {} folders need to be merged, fix by running ArchiveBox.'.format(len(bad_folders)))

    if unmatched:
        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
        print(' ' + '\n '.join(unmatched))

def wget_output_path(link, look_in=None):
    """calculate the path to the wgetted .html file, since wget may
    adjust some paths to be different than the base_url path.
    See docs on wget --adjust-extension (-E)
    """
    # if we have it stored, always prefer the actual output path to the computed one
    if link.get('latest', {}).get('wget'):
        return link['latest']['wget']

    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

    if link['type'] in ('PDF', 'image'):
        return urlencode(base_url(link['url']))

    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # instead of trying to emulate it here, we just look in the output folder
    # to see what html file wget actually created as the output
    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)

    if look_in and os.path.exists(look_in):
        html_files = [
            f for f in os.listdir(look_in)
            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
        ]
        if html_files:
            return urlencode(os.path.join(*wget_folder, html_files[0]))

    return None

    # If finding the actual output file didn't work, fall back to the buggy
    # implementation of the wget .html appending algorithm:
    # split_url = link['url'].split('#', 1)
    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
    #     # already ends in .html
    #     return urlencode(base_url(link['url']))
    # else:
    #     # .html needs to be appended
    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
    #     if without_scheme.endswith('/'):
    #         if query:
    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
    #     else:
    #         if query:
    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
    #         elif '/' in without_scheme:
    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
    #         return urlencode(base_url(link['url']) + '/index.html')

def derived_link_info(link):
    """extend link info with the archive urls and other derived data"""

    url = link['url']

    extended_info = {
        **link,
        'title': link['title'] or base_url(url),
        'date': datetime.fromtimestamp(Decimal(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
        'base_url': base_url(url),
        'domain': domain(url),
        'basename': basename(url),
        'path': path(url),
    }

    # Archive Method Output URLs
    extended_info = {
        **extended_info,
        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**extended_info),
        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
        'files_url': 'archive/{timestamp}/index.html'.format(**extended_info),
        'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'),
        'warc_url': 'archive/{timestamp}/warc'.format(**extended_info),
        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**extended_info),
        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**extended_info),
        'dom_link': 'archive/{timestamp}/output.html'.format(**extended_info),
        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
        'git_url': 'archive/{timestamp}/git'.format(**extended_info),
        'media_url': 'archive/{timestamp}/media'.format(**extended_info),
    }

    # PDF and images are handled slightly differently
    # wget, screenshot, & pdf urls all point to the same file
    if link['type'] in ('PDF', 'image'):
        extended_info.update({
            'archive_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
            'dom_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
            'title': link['title'] or basename(link['url']),
        })

    return extended_info

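# Illustrative example (not part of the original module): for a link like
#   {'url': 'https://example.com/post', 'timestamp': '1544444444', 'title': None, 'type': None, ...}
# derived_link_info() fills in fields such as:
#   'domain'     -> 'example.com'
#   'base_url'   -> 'example.com/post'
#   'files_url'  -> 'archive/1544444444/index.html'
#   'pdf_link'   -> 'archive/1544444444/output.pdf'
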
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
    """Patched version of subprocess.run() to fix blocking io that made timeout ineffective"""

    if input is not None:
        if 'stdin' in kwargs:
            raise ValueError('stdin and input arguments may not both be used.')
        kwargs['stdin'] = PIPE

    if capture_output:
        if ('stdout' in kwargs) or ('stderr' in kwargs):
            raise ValueError('stdout and stderr arguments may not be used '
                             'with capture_output.')
        kwargs['stdout'] = PIPE
        kwargs['stderr'] = PIPE

    with Popen(*popenargs, **kwargs) as process:
        try:
            stdout, stderr = process.communicate(input, timeout=timeout)
        except TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(input, timeout=2)
            except:
                pass
            raise TimeoutExpired(popenargs[0][0], timeout)
        except BaseException:
            process.kill()
            # We don't call process.wait() as .__exit__ does that for us.
            raise

        retcode = process.poll()
        if check and retcode:
            raise CalledProcessError(retcode, process.args,
                                     output=stdout, stderr=stderr)

    return CompletedProcess(process.args, retcode, stdout, stderr)

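# Illustrative usage (not part of the original module): behaves like subprocess.run(),
# but with a timeout that reliably kills the child process, e.g.:
#   result = run(['echo', 'hello'], stdout=PIPE, timeout=5)
#   result.returncode  -> 0
#   result.stdout      -> b'hello\n'
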
def check_link_structure(link):
    assert isinstance(link, dict)
    assert isinstance(link.get('url'), str)
    assert len(link['url']) > 2
    assert len(re.findall(URL_REGEX, link['url'])) == 1

def check_links_structure(links):
    assert isinstance(links, list)
    if links:
        check_link_structure(links[0])