archive_methods.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. import os
  2. import sys
  3. from functools import wraps
  4. from collections import defaultdict
  5. from datetime import datetime
  6. from subprocess import run, PIPE, DEVNULL
  7. from peekable import Peekable
  8. from index import wget_output_path, parse_json_link_index, write_link_index
  9. from links import links_after_timestamp
  10. from config import (
  11. CHROME_BINARY,
  12. FETCH_WGET,
  13. FETCH_WGET_REQUISITES,
  14. FETCH_PDF,
  15. FETCH_SCREENSHOT,
  16. FETCH_DOM,
  17. RESOLUTION,
  18. CHECK_SSL_VALIDITY,
  19. SUBMIT_ARCHIVE_DOT_ORG,
  20. FETCH_AUDIO,
  21. FETCH_VIDEO,
  22. FETCH_FAVICON,
  23. WGET_USER_AGENT,
  24. CHROME_USER_DATA_DIR,
  25. CHROME_SANDBOX,
  26. TIMEOUT,
  27. ANSI,
  28. ARCHIVE_DIR,
  29. )
  30. from util import (
  31. check_dependencies,
  32. progress,
  33. chmod_file,
  34. pretty_path,
  35. )
  36. _RESULTS_TOTALS = { # globals are bad, mmkay
  37. 'skipped': 0,
  38. 'succeded': 0,
  39. 'failed': 0,
  40. }
  41. def archive_links(archive_path, links, source=None, resume=None):
  42. check_dependencies()
  43. to_archive = Peekable(links_after_timestamp(links, resume))
  44. idx, link = 0, to_archive.peek(0)
  45. try:
  46. for idx, link in enumerate(to_archive):
  47. link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
  48. archive_link(link_dir, link)
  49. except (KeyboardInterrupt, SystemExit, Exception) as e:
  50. print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
  51. **ANSI,
  52. now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  53. idx=idx+1,
  54. timestamp=link['timestamp'],
  55. total=len(links),
  56. ))
  57. print(' Continue where you left off by running:')
  58. print(' {} {}'.format(
  59. pretty_path(sys.argv[0]),
  60. link['timestamp'],
  61. ))
  62. if not isinstance(e, KeyboardInterrupt):
  63. raise e
  64. raise SystemExit(1)
  65. def archive_link(link_dir, link, overwrite=True):
  66. """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
  67. update_existing = os.path.exists(link_dir)
  68. if update_existing:
  69. link = {
  70. **parse_json_link_index(link_dir),
  71. **link,
  72. }
  73. else:
  74. os.makedirs(link_dir)
  75. log_link_archive(link_dir, link, update_existing)
  76. if FETCH_WGET:
  77. link = fetch_wget(link_dir, link, overwrite=overwrite)
  78. if FETCH_PDF:
  79. link = fetch_pdf(link_dir, link, overwrite=overwrite)
  80. if FETCH_SCREENSHOT:
  81. link = fetch_screenshot(link_dir, link, overwrite=overwrite)
  82. if FETCH_DOM:
  83. link = fetch_dom(link_dir, link, overwrite=overwrite)
  84. if SUBMIT_ARCHIVE_DOT_ORG:
  85. link = archive_dot_org(link_dir, link, overwrite=overwrite)
  86. # if FETCH_AUDIO:
  87. # link = fetch_audio(link_dir, link, overwrite=overwrite)
  88. # if FETCH_VIDEO:
  89. # link = fetch_video(link_dir, link, overwrite=overwrite)
  90. if FETCH_FAVICON:
  91. link = fetch_favicon(link_dir, link, overwrite=overwrite)
  92. write_link_index(link_dir, link)
  93. # print()
  94. return link
  95. def log_link_archive(link_dir, link, update_existing):
  96. print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
  97. symbol='*' if update_existing else '+',
  98. symbol_color=ANSI['black' if update_existing else 'green'],
  99. now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  100. **link,
  101. **ANSI,
  102. ))
  103. print(' > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
  104. if link['type']:
  105. print(' i {}'.format(link['type']))
  106. def attach_result_to_link(method):
  107. """
  108. Instead of returning a result={output:'...', status:'success'} object,
  109. attach that result to the links's history & latest fields, then return
  110. the updated link object.
  111. """
  112. def decorator(fetch_func):
  113. @wraps(fetch_func)
  114. def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
  115. # initialize methods and history json field on link
  116. link['latest'] = link.get('latest') or {}
  117. link['latest'][method] = link['latest'].get(method) or None
  118. link['history'] = link.get('history') or {}
  119. link['history'][method] = link['history'].get(method) or []
  120. start_ts = datetime.now().timestamp()
  121. # if a valid method output is already present, dont run the fetch function
  122. if link['latest'][method] and not overwrite:
  123. print(' √ {}'.format(method))
  124. result = None
  125. else:
  126. print(' > {}'.format(method))
  127. result = fetch_func(link_dir, link, **kwargs)
  128. end_ts = datetime.now().timestamp()
  129. duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
  130. # append a history item recording fail/success
  131. history_entry = {
  132. 'timestamp': str(start_ts).split('.')[0],
  133. }
  134. if result is None:
  135. history_entry['status'] = 'skipped'
  136. elif isinstance(result.get('output'), Exception):
  137. history_entry['status'] = 'failed'
  138. history_entry['duration'] = duration
  139. history_entry.update(result or {})
  140. link['history'][method].append(history_entry)
  141. else:
  142. history_entry['status'] = 'succeded'
  143. history_entry['duration'] = duration
  144. history_entry.update(result or {})
  145. link['history'][method].append(history_entry)
  146. link['latest'][method] = result['output']
  147. _RESULTS_TOTALS[history_entry['status']] += 1
  148. return link
  149. return timed_fetch_func
  150. return decorator
  151. @attach_result_to_link('wget')
  152. def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
  153. """download full site using wget"""
  154. domain_dir = os.path.join(link_dir, link['domain'])
  155. existing_file = wget_output_path(link)
  156. if os.path.exists(domain_dir) and existing_file:
  157. return {'output': existing_file, 'status': 'skipped'}
  158. CMD = [
  159. # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
  160. *'wget -N -E -np -x -H -k -K -S --restrict-file-names=unix'.split(' '),
  161. *(('-p',) if FETCH_WGET_REQUISITES else ()),
  162. *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
  163. *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate',))),
  164. link['url'],
  165. ]
  166. end = progress(timeout, prefix=' ')
  167. try:
  168. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # index.html
  169. end()
  170. output = wget_output_path(link, look_in=domain_dir)
  171. # Check for common failure cases
  172. if result.returncode > 0:
  173. print(' got wget response code {}:'.format(result.returncode))
  174. if result.returncode != 8:
  175. print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
  176. if b'403: Forbidden' in result.stderr:
  177. raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
  178. if b'404: Not Found' in result.stderr:
  179. raise Exception('404 Not Found')
  180. if b'ERROR 500: Internal Server Error' in result.stderr:
  181. raise Exception('500 Internal Server Error')
  182. if result.returncode == 4:
  183. raise Exception('Failed wget download')
  184. except Exception as e:
  185. end()
  186. print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
  187. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  188. output = e
  189. return {
  190. 'cmd': CMD,
  191. 'output': output,
  192. }
  193. @attach_result_to_link('pdf')
  194. def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
  195. """print PDF of site to file using chrome --headless"""
  196. if link['type'] in ('PDF', 'image'):
  197. return {'output': wget_output_path(link)}
  198. if os.path.exists(os.path.join(link_dir, 'output.pdf')):
  199. return {'output': 'output.pdf', 'status': 'skipped'}
  200. CMD = [
  201. *chrome_headless(user_data_dir=user_data_dir),
  202. '--print-to-pdf',
  203. link['url']
  204. ]
  205. end = progress(timeout, prefix=' ')
  206. try:
  207. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf
  208. end()
  209. if result.returncode:
  210. print(' ', (result.stderr or result.stdout).decode())
  211. raise Exception('Failed to print PDF')
  212. chmod_file('output.pdf', cwd=link_dir)
  213. output = 'output.pdf'
  214. except Exception as e:
  215. end()
  216. print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
  217. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  218. output = e
  219. return {
  220. 'cmd': CMD,
  221. 'output': output,
  222. }
  223. @attach_result_to_link('screenshot')
  224. def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
  225. """take screenshot of site using chrome --headless"""
  226. if link['type'] in ('PDF', 'image'):
  227. return {'output': wget_output_path(link)}
  228. if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
  229. return {'output': 'screenshot.png', 'status': 'skipped'}
  230. CMD = [
  231. *chrome_headless(user_data_dir=user_data_dir),
  232. '--screenshot',
  233. '--window-size={}'.format(resolution),
  234. '--hide-scrollbars',
  235. # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
  236. link['url'],
  237. ]
  238. end = progress(timeout, prefix=' ')
  239. try:
  240. result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # sreenshot.png
  241. end()
  242. if result.returncode:
  243. print(' ', (result.stderr or result.stdout).decode())
  244. raise Exception('Failed to take screenshot')
  245. chmod_file('screenshot.png', cwd=link_dir)
  246. output = 'screenshot.png'
  247. except Exception as e:
  248. end()
  249. print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
  250. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  251. output = e
  252. return {
  253. 'cmd': CMD,
  254. 'output': output,
  255. }
  256. @attach_result_to_link('dom')
  257. def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
  258. """print HTML of site to file using chrome --dump-html"""
  259. if link['type'] in ('PDF', 'image'):
  260. return {'output': wget_output_path(link)}
  261. output_path = os.path.join(link_dir, 'output.html')
  262. if os.path.exists(output_path):
  263. return {'output': 'output.html', 'status': 'skipped'}
  264. CMD = [
  265. *chrome_headless(user_data_dir=user_data_dir),
  266. '--dump-dom',
  267. link['url']
  268. ]
  269. end = progress(timeout, prefix=' ')
  270. try:
  271. with open(output_path, 'w+') as f:
  272. result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html
  273. end()
  274. if result.returncode:
  275. print(' ', (result.stderr).decode())
  276. raise Exception('Failed to fetch DOM')
  277. chmod_file('output.html', cwd=link_dir)
  278. output = 'output.html'
  279. except Exception as e:
  280. end()
  281. print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
  282. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  283. output = e
  284. return {
  285. 'cmd': CMD,
  286. 'output': output,
  287. }
  288. @attach_result_to_link('archive_org')
  289. def archive_dot_org(link_dir, link, timeout=TIMEOUT):
  290. """submit site to archive.org for archiving via their service, save returned archive url"""
  291. path = os.path.join(link_dir, 'archive.org.txt')
  292. if os.path.exists(path):
  293. archive_org_url = open(path, 'r').read().strip()
  294. return {'output': archive_org_url, 'status': 'skipped'}
  295. submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
  296. success = False
  297. CMD = ['curl', '-L', '-I', '-X', 'GET', submit_url]
  298. end = progress(timeout, prefix=' ')
  299. try:
  300. result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
  301. end()
  302. # Parse archive.org response headers
  303. headers = defaultdict(list)
  304. # lowercase all the header names and store in dict
  305. for header in result.stdout.splitlines():
  306. if b':' not in header or not header.strip():
  307. continue
  308. name, val = header.decode().split(':', 1)
  309. headers[name.lower().strip()].append(val.strip())
  310. # Get successful archive url in "content-location" header or any errors
  311. content_location = headers['content-location']
  312. errors = headers['x-archive-wayback-runtime-error']
  313. if content_location:
  314. saved_url = 'https://web.archive.org{}'.format(content_location[0])
  315. success = True
  316. elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
  317. output = submit_url
  318. # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
  319. elif errors:
  320. raise Exception(', '.join(errors))
  321. else:
  322. raise Exception('Failed to find "content-location" URL header in Archive.org response.')
  323. except Exception as e:
  324. end()
  325. print(' Visit url to see output:', ' '.join(CMD))
  326. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  327. output = e
  328. if success:
  329. with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
  330. f.write(saved_url)
  331. chmod_file('archive.org.txt', cwd=link_dir)
  332. output = saved_url
  333. return {
  334. 'cmd': CMD,
  335. 'output': output,
  336. }
  337. @attach_result_to_link('favicon')
  338. def fetch_favicon(link_dir, link, timeout=TIMEOUT):
  339. """download site favicon from google's favicon api"""
  340. if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
  341. return {'output': 'favicon.ico', 'status': 'skipped'}
  342. CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
  343. fout = open('{}/favicon.ico'.format(link_dir), 'w')
  344. end = progress(timeout, prefix=' ')
  345. try:
  346. run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # favicon.ico
  347. fout.close()
  348. end()
  349. chmod_file('favicon.ico', cwd=link_dir)
  350. output = 'favicon.ico'
  351. except Exception as e:
  352. fout.close()
  353. end()
  354. print(' Run to see full output:', ' '.join(CMD))
  355. print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  356. output = e
  357. return {
  358. 'cmd': CMD,
  359. 'output': output,
  360. }
  361. # @attach_result_to_link('audio')
  362. # def fetch_audio(link_dir, link, timeout=TIMEOUT):
  363. # """Download audio rip using youtube-dl"""
  364. # if link['type'] not in ('soundcloud',)\
  365. # and 'audio' not in link['tags']:
  366. # return
  367. # path = os.path.join(link_dir, 'audio')
  368. # if not os.path.exists(path) or overwrite:
  369. # print(' - Downloading audio')
  370. # CMD = [
  371. # "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
  372. # link['url'],
  373. # ]
  374. # end = progress(timeout, prefix=' ')
  375. # try:
  376. # result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # audio/audio.mp3
  377. # end()
  378. # if result.returncode:
  379. # print(' ', result.stderr.decode())
  380. # raise Exception('Failed to download audio')
  381. # chmod_file('audio.mp3', cwd=link_dir)
  382. # return 'audio.mp3'
  383. # except Exception as e:
  384. # end()
  385. # print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
  386. # print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  387. # raise
  388. # else:
  389. # print(' √ Skipping audio download')
  390. # @attach_result_to_link('video')
  391. # def fetch_video(link_dir, link, timeout=TIMEOUT):
  392. # """Download video rip using youtube-dl"""
  393. # if link['type'] not in ('youtube', 'youku', 'vimeo')\
  394. # and 'video' not in link['tags']:
  395. # return
  396. # path = os.path.join(link_dir, 'video')
  397. # if not os.path.exists(path) or overwrite:
  398. # print(' - Downloading video')
  399. # CMD = [
  400. # "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
  401. # link['url'],
  402. # ]
  403. # end = progress(timeout, prefix=' ')
  404. # try:
  405. # result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # video/movie.mp4
  406. # end()
  407. # if result.returncode:
  408. # print(' ', result.stderr.decode())
  409. # raise Exception('Failed to download video')
  410. # chmod_file('video.mp4', cwd=link_dir)
  411. # return 'video.mp4'
  412. # except Exception as e:
  413. # end()
  414. # print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
  415. # print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
  416. # raise
  417. # else:
  418. # print(' √ Skipping video download')
  419. def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
  420. args = [binary, '--headless'] # '--disable-gpu'
  421. if not CHROME_SANDBOX:
  422. args.append('--no-sandbox')
  423. default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
  424. if user_data_dir:
  425. args.append('--user-data-dir={}'.format(user_data_dir))
  426. elif os.path.exists(default_profile):
  427. args.append('--user-data-dir={}'.format(default_profile))
  428. return args