Browse Source

fetch page title during archiving process

Nick Sweeting 6 years ago
parent
commit
5a7d00a639
5 changed files with 44 additions and 15 deletions
  1. archivebox/archive_methods.py  +30 -2
  2. archivebox/config.py  +1 -0
  3. archivebox/links.py  +4 -1
  4. archivebox/util.py  +8 -12
  5. etc/ArchiveBox.conf.default  +1 -0

+ 30 - 2
archivebox/archive_methods.py

@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
     FETCH_WGET,
     FETCH_WGET_REQUISITES,
     FETCH_PDF,
@@ -23,7 +25,6 @@ from config import (
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
     WGET_USER_AGENT,
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,
@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
     check_dependencies,
+    fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
+        if FETCH_TITLE:
+            link = fetch_title(link_dir, link, overwrite=overwrite)
+
         if FETCH_WGET:
             link = fetch_wget(link_dir, link, overwrite=overwrite)
 
@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
         **ANSI,
     ))
 
@@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
     }
 
+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+
+    end = progress(timeout, prefix='      ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""

+ 1 - 0
archivebox/config.py

@@ -27,6 +27,7 @@ FETCH_WARC =             os.getenv('FETCH_WARC',             'True'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
 FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'True'             ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
+FETCH_TITLE =            os.getenv('FETCH_TITLE',            'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 
 CHECK_SSL_VALIDITY =     os.getenv('CHECK_SSL_VALIDITY',     'True'             ).lower() == 'true'
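The new FETCH_TITLE flag follows the same parsing convention as its neighbors: the raw environment value is lowercased and compared against the literal string 'true', so '1', 'yes', or any typo all read as False. A quick illustration (env_flag is a hypothetical helper, not part of the codebase):

    import os

    def env_flag(name, default='True'):
        # Only the literal string "true" (any case) enables a flag
        return os.getenv(name, default).lower() == 'true'

    os.environ['FETCH_TITLE'] = 'yes'
    env_flag('FETCH_TITLE')  # -> False: 'yes' != 'true', so the title step is skipped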

+ 4 - 1
archivebox/links.py

@@ -57,7 +57,7 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'])
+        link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
 
         latest = link['latest']
@@ -76,6 +76,9 @@ def validate_links(links):
         if not latest.get('favicon'):
             latest['favicon'] = None
 
+        if not link['latest'].get('title'):
+            link['latest']['title'] = link['title']
+
     return list(links)
 
 def new_links(all_links, existing_links):
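The unescape() guard above exists because titles may now legitimately be None until fetch_title fills them in later, and html.unescape assumes a string argument:

    from html import unescape

    title = None
    unescape(title) if title else None  # -> None, no crash
    # an unguarded unescape(None) raises a TypeError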

+ 8 - 12
archivebox/util.py

@@ -44,6 +44,7 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
 
 URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
+HTML_TITLE_REGEX = '<title>(.[^<>]+)'
 
 
 def check_dependencies():
@@ -227,22 +228,17 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=True):
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    if default is True:
-        default = url
-
     try:
-        if SHOW_PROGRESS:
+        if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
         html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
         return match.group(1) if match else default or None
     except Exception:
-        if default is False:
-            raise
-        return default
+        return None
 
 
 def str_between(string, start, end=None):
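Two details in the rewritten fetch_page_title are worth flagging: the new HTML_TITLE_REGEX constant is defined in this commit but the body still searches with the inline '<title>(.*?)</title>' pattern, and urlopen still hardcodes timeout=10 rather than using the new timeout parameter. Note also that the unchanged `match.group(1) if match else default or None` context line still references the removed default parameter, so a page with no <title> tag would hit a NameError inside the try block and fall through to return None. On well-formed titles the two regexes agree:

    import re

    html = '<html><head><title>Example Domain</title></head></html>'
    re.search('<title>(.[^<>]+)', html).group(1)      # -> 'Example Domain'
    re.search('<title>(.*?)</title>', html).group(1)  # -> 'Example Domain'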
@@ -277,19 +273,19 @@ def merge_links(a, b):
     """deterministially merge two links, favoring longer field values over shorter,
     """deterministially merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     and "cleaner" values over worse ones.
     """
     """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
     
     
     url = longer('url')
     url = longer('url')
     longest_title = longer('title')
     longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
     link = {
     link = {
         'timestamp': earlier('timestamp'),
         'timestamp': earlier('timestamp'),
         'url': url,
         'url': url,
         'domain': domain(url),
         'domain': domain(url),
         'base_url': base_url(url),
         'base_url': base_url(url),
         'tags': longer('tags'),
         'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
     }
     link['type'] = get_link_type(link)
     link['type'] = get_link_type(link)
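The merge_links changes are all about tolerating missing titles: len(None) and the `'://' not in None` membership test would each raise a TypeError, so the reworked longer helper only compares lengths when both values are set, and the title checks coerce None to ''. Its behavior, shown here over plain values rather than the key closure used in the code (sample inputs are made up):

    longer = lambda x, y: (x if len(x) > len(y) else y) if (x and y) else (x or y)

    longer('Example Domain', 'Example')  # -> 'Example Domain' (both set: longer wins)
    longer(None, 'Example')              # -> 'Example' (one missing: take the other)
    longer(None, None)                   # -> None (neither set: stays None)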
@@ -532,7 +528,7 @@ def derived_link_info(link):
             'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
+            'title': link['title'] or basename(link['url']),
         })
     return link_info
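With the '{title} ({type})' format string gone, untitled links in the index fall back to the last path segment of their URL. Assuming basename is os.path.basename (its import is outside this diff):

    from os.path import basename

    basename('https://example.com/posts/page.html')  # -> 'page.html'
    basename('https://example.com/')                 # -> '' (a trailing slash yields an empty name)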
 
 

+ 1 - 0
etc/ArchiveBox.conf.default

@@ -10,6 +10,7 @@
 # FETCH_MEDIA=False
 # FETCH_GIT=True
 # FETCH_FAVICON=True
+# FETCH_TITLE=True
 # SUBMIT_ARCHIVE_DOT_ORG=True
 
 ### To only download new links, and never attempt to update old ones, uncomment this line: