@@ -2,22 +2,18 @@ import os
 import re
 import sys
 import time
-import json
-from urllib.request import Request, urlopen
-from urllib.parse import urlparse
 
+from urllib.request import Request, urlopen
+from urllib.parse import urlparse, quote
 from decimal import Decimal
-from urllib.parse import quote
 from datetime import datetime
-from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 from multiprocessing import Process
+from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 
 from config import (
     ANSI,
-    IS_TTY,
     TERM_WIDTH,
     REPO_DIR,
-    OUTPUT_DIR,
     SOURCES_DIR,
     ARCHIVE_DIR,
     OUTPUT_PERMISSIONS,
@@ -42,7 +38,9 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
-# URL helpers: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+### Parsing Helpers
+
+# URL Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
 scheme = lambda url: urlparse(url).scheme
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
@@ -72,6 +70,20 @@ HTML_TITLE_REGEX = re.compile(
     re.IGNORECASE,
 )
 
+### Checks & Tests
+
+def check_link_structure(link):
+    """basic sanity check invariants to make sure the data is valid"""
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
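+    # the url must match the plain-text URL_REGEX exactly once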
+    assert len(re.findall(URL_REGEX, link['url'])) == 1
+
+def check_links_structure(links):
+    """basic sanity check invariants to make sure the data is valid"""
+    assert isinstance(links, list)
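+    # spot-check only the first link, assuming the rest share its structure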
+    if links:
+        check_link_structure(links[0])
 
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
@@ -134,7 +146,6 @@ def check_dependencies():
         print('    See https://github.com/pirate/ArchiveBox for help.')
         raise SystemExit(1)
 
-
 def check_url_parsing():
     """Check that plain text regex URL parsing works as expected"""
     test_urls = '''
@@ -159,111 +170,7 @@ def check_url_parsing():
     assert len(re.findall(URL_REGEX, test_urls)) == 12


-def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
-    """quote the argument with whitespace in a command so the user can
-    copy-paste the outputted string directly to run the cmd
-    """
-
-    quoted_cmd = ' '.join(
-        '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in cmd
-    )
-
-    output_lines = [
-        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        '    {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
-        'Run to see full output:'
-        '    cd {};'.format(pwd),
-        '    {}'.format(quoted_cmd),
-    ]
-
-    return '\n'.join(
-        '{}{}'.format(prefix, line)
-        for line in output_lines
-        if line
-    )
-
-
-def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
-    """chmod -R <permissions> <cwd>/<path>"""
-
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print('    ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
-
-def progress(seconds=TIMEOUT, prefix=''):
-    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
-    returns end() function to instantly finish the progress
-    """
-
-    if not SHOW_PROGRESS:
-        return lambda: None
-
-    def progress_bar(seconds, prefix):
-        """show timer in the form of progress bar, with percentage and seconds remaining"""
-        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
-        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
-        try:
-            for s in range(seconds * chunks):
-                progress = s / chunks / seconds * 100
-                bar_width = round(progress/(100/chunks))
-
-                # ████████████████████ 0.9% (1/60sec)
-                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
-                    prefix,
-                    ANSI['green'],
-                    (chunk * bar_width).ljust(chunks),
-                    ANSI['reset'],
-                    round(progress, 1),
-                    round(s/chunks),
-                    seconds,
-                ))
-                sys.stdout.flush()
-                time.sleep(1 / chunks)
-
-            # ██████████████████████████████████ 100.0% (60/60sec)
-            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
-                prefix,
-                ANSI['red'],
-                chunk * chunks,
-                ANSI['reset'],
-                100.0,
-                seconds,
-                seconds,
-            ))
-            sys.stdout.flush()
-        except KeyboardInterrupt:
-            print()
-            pass
-
-    p = Process(target=progress_bar, args=(seconds, prefix))
-    p.start()
-
-    def end():
-        """immediately finish progress and clear the progressbar line"""
-
-        # protect from double termination
-        #if p is None or not hasattr(p, 'kill'):
-        #    return
-        nonlocal p
-        if p is not None:
-            p.terminate()
-        p = None
-
-        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
-        sys.stdout.flush()
-
-    return end
-
-def pretty_path(path):
-    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    return path.replace(REPO_DIR + '/', '')
-
+### Random Helpers
 
 def save_stdin_source(raw_text):
     if not os.path.exists(SOURCES_DIR):
@@ -278,23 +185,8 @@ def save_stdin_source(raw_text):
 
     return source_path
 
-
-def fetch_page_content(url, timeout=TIMEOUT):
-    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
-
-    if CHECK_SSL_VALIDITY:
-        resp = urlopen(req, timeout=timeout)
-    else:
-        import ssl
-        insecure = ssl._create_unverified_context()
-        resp = urlopen(req, timeout=timeout, context=insecure)
-
-    encoding = resp.headers.get_content_charset() or 'utf-8'
-    return resp.read().decode(encoding)
-
-
 def save_remote_source(url, timeout=TIMEOUT):
-    """download a given url's content into downloads/domain.txt"""
+    """download a given url's content into output/sources/domain-<timestamp>.txt"""
 
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -311,7 +203,7 @@ def save_remote_source(url, timeout=TIMEOUT):
     ))
     end = progress(TIMEOUT, prefix='    ')
     try:
-        downloaded_xml = fetch_page_content(url, timeout=timeout)
+        downloaded_xml = download_url(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -330,7 +222,6 @@ def save_remote_source(url, timeout=TIMEOUT):
 
     return source_path
 
-
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
     if not FETCH_TITLE:
@@ -341,7 +232,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         sys.stdout.write('.')
         sys.stdout.flush()
 
-        html = fetch_page_content(url, timeout=timeout)
+        html = download_url(url, timeout=timeout)
 
         match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
@@ -352,215 +243,6 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         #     ))
         return None
 
-
-def str_between(string, start, end=None):
-    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
-
-    content = string.split(start, 1)[-1]
-    if end is not None:
-        content = content.rsplit(end, 1)[0]
-
-    return content
-
-def get_link_type(link):
-    """Certain types of links need to be handled specially, this figures out when that's the case"""
-
-    if extension(link['url']) == 'pdf':
-        return 'PDF'
-    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
-        return 'image'
-    elif 'wikipedia.org' in domain(link['url']).lower():
-        return 'wiki'
-    elif 'youtube.com' in domain(link['url']).lower():
-        return 'youtube'
-    elif 'soundcloud.com' in domain(link['url']).lower():
-        return 'soundcloud'
-    elif 'youku.com' in domain(link['url']).lower():
-        return 'youku'
-    elif 'vimeo.com' in domain(link['url']).lower():
-        return 'vimeo'
-    return None
-
-def merge_links(a, b):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
-    link = {
-        'timestamp': earlier('timestamp'),
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'tags': longer('tags'),
-        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
-        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
-    }
-    link['type'] = get_link_type(link)
-    return link
-
-def find_link(folder, links):
-    """for a given archive folder, find the corresponding link object in links"""
-    url = parse_url(folder)
-    if url:
-        for link in links:
-            if (base_url(link['url']) in url) or (url in link['url']):
-                return link
-
-    timestamp = folder.split('.')[0]
-    for link in links:
-        if link['timestamp'].startswith(timestamp):
-            if domain(link['url']) in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
-                return link  # careful now, this isn't safe for most ppl
-            if domain(link['url']) in parse_url(folder):
-                return link
-    return None
-
-
-def parse_url(folder):
-    """for a given archive folder, figure out what url it's for"""
-    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
-    if os.path.exists(link_json):
-        with open(link_json, 'r') as f:
-            try:
-                link_json = f.read().strip()
-                if link_json:
-                    link = json.loads(link_json)
-                    return base_url(link['url'])
-            except ValueError:
-                print('File contains invalid JSON: {}!'.format(link_json))
-
-    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
-    if os.path.exists(archive_org_txt):
-        with open(archive_org_txt, 'r') as f:
-            original_link = f.read().strip().split('/http', 1)[-1]
-            with_scheme = 'http{}'.format(original_link)
-            return with_scheme
-
-    return ''
-
-def manually_merge_folders(source, target):
-    """prompt for user input to resolve a conflict between two archive folders"""
-
-    if not IS_TTY:
-        return
-
-    fname = lambda path: path.split('/')[-1]
-
-    print('    {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
-    print('      - [enter]: do nothing (keep both)')
-    print('      - a: prefer files from {}'.format(source))
-    print('      - b: prefer files from {}'.format(target))
-    print('      - q: quit and resolve the conflict manually')
-    try:
-        answer = input('> ').strip().lower()
-    except KeyboardInterrupt:
-        answer = 'q'
-
-    assert answer in ('', 'a', 'b', 'q'), 'Invalid choice.'
-
-    if answer == 'q':
-        print('\nJust run ArchiveBox again to pick up where you left off.')
-        raise SystemExit(0)
-    elif answer == '':
-        return
-
-    files_in_source = set(os.listdir(source))
-    files_in_target = set(os.listdir(target))
-    for file in files_in_source:
-        if file in files_in_target:
-            to_delete = target if answer == 'a' else source
-            run(['rm', '-Rf', os.path.join(to_delete, file)])
-        run(['mv', os.path.join(source, file), os.path.join(target, file)])
-
-    if not set(os.listdir(source)):
-        run(['rm', '-Rf', source])
-
-def fix_folder_path(archive_path, link_folder, link):
-    """given a folder, merge it to the canonical 'correct' path for the given link object"""
-    source = os.path.join(archive_path, link_folder)
-    target = os.path.join(archive_path, link['timestamp'])
-
-    url_in_folder = parse_url(source)
-    if not (url_in_folder in base_url(link['url'])
-            or base_url(link['url']) in url_in_folder):
-        raise ValueError('The link does not match the url for this folder.')
-
-    if not os.path.exists(target):
-        # target doesn't exist so nothing needs merging, simply move A to B
-        run(['mv', source, target])
-    else:
-        # target folder exists, check for conflicting files and attempt manual merge
-        files_in_source = set(os.listdir(source))
-        files_in_target = set(os.listdir(target))
-        conflicting_files = files_in_source & files_in_target
-
-        if not conflicting_files:
-            for file in files_in_source:
-                run(['mv', os.path.join(source, file), os.path.join(target, file)])
-
-    if os.path.exists(source):
-        files_in_source = set(os.listdir(source))
-        if files_in_source:
-            manually_merge_folders(source, target)
-        else:
-            run(['rm', '-R', source])
-
-
-def migrate_data():
-    # migrate old folder to new OUTPUT folder
-    old_dir = os.path.join(REPO_DIR, 'html')
-    if os.path.exists(old_dir):
-        print('[!] WARNING: Moved old output folder "html" to new location: {}'.format(OUTPUT_DIR))
-        run(['mv', old_dir, OUTPUT_DIR], timeout=10)
-
-
-def cleanup_archive(archive_path, links):
-    """move any incorrectly named folders to their canonical locations"""
-
-    # for each folder that exists, see if we can match it up with a known good link
-    # if we can, then merge the two folders (TODO: if not, move it to lost & found)
-
-    unmatched = []
-    bad_folders = []
-
-    if not os.path.exists(archive_path):
-        return
-
-    for folder in os.listdir(archive_path):
-        try:
-            files = os.listdir(os.path.join(archive_path, folder))
-        except NotADirectoryError:
-            continue
-
-        if files:
-            link = find_link(folder, links)
-            if link is None:
-                unmatched.append(folder)
-                continue
-
-            if folder != link['timestamp']:
-                bad_folders.append((folder, link))
-        else:
-            # delete empty folders
-            run(['rm', '-R', os.path.join(archive_path, folder)])
-
-    if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
-        print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
-        for folder, link in bad_folders:
-            fix_folder_path(archive_path, folder, link)
-    elif bad_folders:
-        print('[!] Warning! {} folders need to be merged, fix by running ArchiveBox.'.format(len(bad_folders)))
-
-    if unmatched:
-        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
-        print('    ' + '\n    '.join(unmatched))
-
-
 def wget_output_path(link, look_in=None):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
@@ -615,6 +297,88 @@ def wget_output_path(link, look_in=None):
     #     return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
     #     return urlencode(base_url(link['url']) + '/index.html')
 
+### String Manipulation & Logging Helpers
+
+def str_between(string, start, end=None):
+    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
+
+    content = string.split(start, 1)[-1]
+    if end is not None:
+        content = content.rsplit(end, 1)[0]
+
+    return content
+
+def pretty_path(path):
+    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
+    return path.replace(REPO_DIR + '/', '')
+
+def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
+    """format an error message for a failed command, quoting any arguments
+    with whitespace so the user can copy-paste the output to re-run the cmd
+    """
+
+    quoted_cmd = ' '.join(
+        '"{}"'.format(arg) if ' ' in arg else arg
+        for arg in cmd
+    )
+
+    output_lines = [
+        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
+        '    {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
+        'Run to see full output:',
+        '    cd {};'.format(pwd),
+        '    {}'.format(quoted_cmd),
+    ]
+
+    return '\n'.join(
+        '{}{}'.format(prefix, line)
+        for line in output_lines
+        if line
+    )
+
+
+### Link Helpers
+
+def merge_links(a, b):
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
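+    # prefer the longer of the two values, or whichever one is non-empty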
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link
+
+def get_link_type(link):
+    """Certain types of links need to be handled specially, this figures out when that's the case"""
+
+    if extension(link['url']) == 'pdf':
+        return 'PDF'
+    elif extension(link['url']) in ('png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+        return 'image'
+    elif 'wikipedia.org' in domain(link['url']).lower():
+        return 'wiki'
+    elif 'youtube.com' in domain(link['url']).lower():
+        return 'youtube'
+    elif 'soundcloud.com' in domain(link['url']).lower():
+        return 'soundcloud'
+    elif 'youku.com' in domain(link['url']).lower():
+        return 'youku'
+    elif 'vimeo.com' in domain(link['url']).lower():
+        return 'vimeo'
+    return None
 
 def derived_link_info(link):
     """extend link info with the archive urls and other derived data"""
@@ -625,15 +389,12 @@ def derived_link_info(link):
 
     extended_info = {
         **link,
-        'title': link['title'] or base_url(url),
-        'date': to_date_str(link['timestamp']),
+        'bookmarked_date': to_date_str(link['timestamp']),
         'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
-        'base_url': base_url(url),
         'domain': domain(url),
-        'basename': basename(url),
         'path': path(url),
-        'type': link['type'] or 'website',
-        'tags': link['tags'] or 'untagged',
+        'basename': basename(url),
+        'base_url': base_url(url),
     }
 
     # Archive Method Output URLs
@@ -667,6 +428,96 @@ def derived_link_info(link):
     return extended_info


+### Python / System Helpers
+
+def progress(seconds=TIMEOUT, prefix=''):
+    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
+    returns end() function to instantly finish the progress
+    """
+
+    if not SHOW_PROGRESS:
+        return lambda: None
+
+    def progress_bar(seconds, prefix):
+        """show timer in the form of progress bar, with percentage and seconds remaining"""
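+        # fall back to plain ascii '#' when stdout can't render unicode block characters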
+        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
+        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
+        try:
+            for s in range(seconds * chunks):
+                progress = s / chunks / seconds * 100
+                bar_width = round(progress/(100/chunks))
+
+                # ████████████████████ 0.9% (1/60sec)
+                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+                    prefix,
+                    ANSI['green'],
+                    (chunk * bar_width).ljust(chunks),
+                    ANSI['reset'],
+                    round(progress, 1),
+                    round(s/chunks),
+                    seconds,
+                ))
+                sys.stdout.flush()
+                time.sleep(1 / chunks)
+
+            # ██████████████████████████████████ 100.0% (60/60sec)
+            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+                prefix,
+                ANSI['red'],
+                chunk * chunks,
+                ANSI['reset'],
+                100.0,
+                seconds,
+                seconds,
+            ))
+            sys.stdout.flush()
+        except KeyboardInterrupt:
+            print()
+            pass
+
+    p = Process(target=progress_bar, args=(seconds, prefix))
+    p.start()
+
+    def end():
+        """immediately finish progress and clear the progressbar line"""
+
+        # protect from double termination
+        #if p is None or not hasattr(p, 'kill'):
+        #    return
+        nonlocal p
+        if p is not None:
+            p.terminate()
+        p = None
+
+        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
+        sys.stdout.flush()
+
+    return end
+
+def download_url(url, timeout=TIMEOUT):
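+    """download the contents of a remote url and return the decoded text"""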
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
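+    # only skip SSL certificate verification if the user has disabled CHECK_SSL_VALIDITY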
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
+    """chmod -R <permissions> <cwd>/<path>"""
+
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print('    ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
     """Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
 
@@ -701,16 +552,3 @@
         raise CalledProcessError(retcode, process.args,
                                  output=stdout, stderr=stderr)
     return CompletedProcess(process.args, retcode, stdout, stderr)
-
-
-def check_link_structure(link):
-    assert isinstance(link, dict)
-    assert isinstance(link.get('url'), str)
-    assert len(link['url']) > 2
-    assert len(re.findall(URL_REGEX, link['url'])) == 1
-
-
-def check_links_structure(links):
-    assert isinstance(links, list)
-    if links:
-        check_link_structure(links[0])