Browse Source

fetch page title during archiving process

Nick Sweeting 6 years ago
parent
commit
5a7d00a639
5 changed files with 44 additions and 15 deletions
  1. archivebox/archive_methods.py  +30 -2
  2. archivebox/config.py  +1 -0
  3. archivebox/links.py  +4 -1
  4. archivebox/util.py  +8 -12
  5. etc/ArchiveBox.conf.default  +1 -0

+ 30 - 2
archivebox/archive_methods.py

@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
     FETCH_WGET,
     FETCH_WGET_REQUISITES,
     FETCH_PDF,
@@ -23,7 +25,6 @@ from config import (
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
     WGET_USER_AGENT,
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,
@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
     check_dependencies,
+    fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
+        if FETCH_TITLE:
+            link = fetch_title(link_dir, link, overwrite=overwrite)
+
         if FETCH_WGET:
             link = fetch_wget(link_dir, link, overwrite=overwrite)
 
@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
         **ANSI,
     ))
 
@@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
     }
 
+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+
+    end = progress(timeout, prefix='      ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""

+ 1 - 0
archivebox/config.py

@@ -27,6 +27,7 @@ FETCH_WARC =             os.getenv('FETCH_WARC',             'True'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
 FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'True'             ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
+FETCH_TITLE =            os.getenv('FETCH_TITLE',            'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 
 CHECK_SSL_VALIDITY =     os.getenv('CHECK_SSL_VALIDITY',     'True'             ).lower() == 'true'
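The new FETCH_TITLE flag follows the same parsing convention as its neighbors: the raw environment value is lowercased and compared against the literal string 'true', so '1', 'yes', or any typo all read as False. A quick illustration (env_flag is a hypothetical helper, not part of the codebase):

    import os

    def env_flag(name, default='True'):
        # Only the literal string "true" (any case) enables a flag
        return os.getenv(name, default).lower() == 'true'

    os.environ['FETCH_TITLE'] = 'yes'
    env_flag('FETCH_TITLE')  # -> False: 'yes' != 'true', so the title step is skipped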

+ 4 - 1
archivebox/links.py

@@ -57,7 +57,7 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'])
+        link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
 
         latest = link['latest']
@@ -76,6 +76,9 @@ def validate_links(links):
         if not latest.get('favicon'):
             latest['favicon'] = None
 
+        if not link['latest'].get('title'):
+            link['latest']['title'] = link['title']
+
     return list(links)
 
 def new_links(all_links, existing_links):
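The unescape() guard above exists because titles may now legitimately be None until fetch_title fills them in later, and html.unescape assumes a string argument:

    from html import unescape

    title = None
    unescape(title) if title else None  # -> None, no crash
    # an unguarded unescape(None) raises a TypeError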

+ 8 - 12
archivebox/util.py

@@ -44,6 +44,7 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
 
 URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
+HTML_TITLE_REGEX = '<title>(.[^<>]+)'
 
 
 def check_dependencies():
@@ -227,22 +228,17 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=True):
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    if default is True:
-        default = url
-
     try:
-        if SHOW_PROGRESS:
+        if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
         html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
         return match.group(1) if match else default or None
     except Exception:
-        if default is False:
-            raise
-        return default
+        return None
 
 
 def str_between(string, start, end=None):
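Two details in the rewritten fetch_page_title are worth flagging: the new HTML_TITLE_REGEX constant is defined in this commit but the body still searches with the inline '<title>(.*?)</title>' pattern, and urlopen still hardcodes timeout=10 rather than using the new timeout parameter. Note also that the unchanged `match.group(1) if match else default or None` context line still references the removed default parameter, so a page with no <title> tag would hit a NameError inside the try block and fall through to return None. On well-formed titles the two regexes agree:

    import re

    html = '<html><head><title>Example Domain</title></head></html>'
    re.search('<title>(.[^<>]+)', html).group(1)      # -> 'Example Domain'
    re.search('<title>(.*?)</title>', html).group(1)  # -> 'Example Domain'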
@@ -277,19 +273,19 @@ def merge_links(a, b):
     """deterministially merge two links, favoring longer field values over shorter,
     """deterministially merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     and "cleaner" values over worse ones.
     """
     """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
     
     
     url = longer('url')
     url = longer('url')
     longest_title = longer('title')
     longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
     link = {
     link = {
         'timestamp': earlier('timestamp'),
         'timestamp': earlier('timestamp'),
         'url': url,
         'url': url,
         'domain': domain(url),
         'domain': domain(url),
         'base_url': base_url(url),
         'base_url': base_url(url),
         'tags': longer('tags'),
         'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
     }
     link['type'] = get_link_type(link)
     link['type'] = get_link_type(link)
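The merge_links changes are all about tolerating missing titles: len(None) and the `'://' not in None` membership test would each raise a TypeError, so the reworked longer helper only compares lengths when both values are set, and the title checks coerce None to ''. Its behavior, shown here over plain values rather than the key closure used in the code (sample inputs are made up):

    longer = lambda x, y: (x if len(x) > len(y) else y) if (x and y) else (x or y)

    longer('Example Domain', 'Example')  # -> 'Example Domain' (both set: longer wins)
    longer(None, 'Example')              # -> 'Example' (one missing: take the other)
    longer(None, None)                   # -> None (neither set: stays None)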
@@ -532,7 +528,7 @@ def derived_link_info(link):
             'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
+            'title': link['title'] or basename(link['url']),
         })
     return link_info
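With the '{title} ({type})' format string gone, untitled links in the index fall back to the last path segment of their URL. Assuming basename is os.path.basename (its import is outside this diff):

    from os.path import basename

    basename('https://example.com/posts/page.html')  # -> 'page.html'
    basename('https://example.com/')                 # -> '' (a trailing slash yields an empty name)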
 
 

+ 1 - 0
etc/ArchiveBox.conf.default

@@ -10,6 +10,7 @@
 # FETCH_MEDIA=False
 # FETCH_GIT=True
 # FETCH_FAVICON=True
+# FETCH_TITLE=True
 # SUBMIT_ARCHIVE_DOT_ORG=True
 
 ### To only download new links, and never attempt to update old ones, uncomment this line: