commit fa6f53f2af: used derived info for all derivable info
Nick Sweeting, 6 years ago

1 changed file with 39 additions and 36 deletions:
    archivebox/util.py

archivebox/util.py: +39 -36

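For reference, the new right-hand sides all lean on small URL-derivation helpers defined elsewhere in util.py. The names come from the diff; the implementations below are only a plausible sketch of their semantics, not the committed code:

    from urllib.parse import urlparse

    def domain(url):
        # hostname portion of a url, e.g. 'en.wikipedia.org'
        return urlparse(url).netloc

    def base_url(url):
        # url with its scheme stripped, e.g. 'example.com/page?q=1'
        return url.split('://', 1)[-1]

    def extension(url):
        # lowercase extension of the last path segment, '' if none
        last_segment = urlparse(url).path.rsplit('/', 1)[-1]
        return last_segment.rsplit('.', 1)[-1].lower() if '.' in last_segment else ''
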
@@ -340,19 +340,19 @@ def str_between(string, start, end=None):
 def get_link_type(link):
     """Certain types of links need to be handled specially, this figures out when that's the case"""
 
-    if link['base_url'].endswith('.pdf'):
+    if extension(link['url']) == 'pdf':
         return 'PDF'
-    elif link['base_url'].rsplit('.', 1) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
         return 'image'
-    elif 'wikipedia.org' in link['domain']:
+    elif 'wikipedia.org' in domain(link['url']).lower():
         return 'wiki'
-    elif 'youtube.com' in link['domain']:
+    elif 'youtube.com' in domain(link['url']).lower():
         return 'youtube'
-    elif 'soundcloud.com' in link['domain']:
+    elif 'soundcloud.com' in domain(link['url']).lower():
         return 'soundcloud'
-    elif 'youku.com' in link['domain']:
+    elif 'youku.com' in domain(link['url']).lower():
         return 'youku'
-    elif 'vimeo.com' in link['domain']:
+    elif 'vimeo.com' in domain(link['url']).lower():
         return 'vimeo'
     return None
 
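As a quick check of the new classification path, assuming the helper semantics sketched above (link dict shape inferred from the surrounding code):

    assert get_link_type({'url': 'https://example.com/report.pdf'}) == 'PDF'
    assert get_link_type({'url': 'https://vimeo.com/1234'}) == 'vimeo'
    assert get_link_type({'url': 'https://example.com/'}) is None
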
@@ -383,15 +383,15 @@ def find_link(folder, links):
     url = parse_url(folder)
     if url:
         for link in links:
-            if (link['base_url'] in url) or (url in link['url']):
+            if (base_url(link['url']) in url) or (url in link['url']):
                 return link
 
     timestamp = folder.split('.')[0]
     for link in links:
         if link['timestamp'].startswith(timestamp):
-            if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
+            if domain(link['url']) in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
                 return link      # careful now, this isn't safe for most ppl
-            if link['domain'] in parse_url(folder):
+            if domain(link['url']) in parse_url(folder):
                 return link
     return None
 
@@ -405,7 +405,7 @@ def parse_url(folder):
                 link_json = f.read().strip()
                 if link_json:
                     link = json.loads(link_json)
-                    return link['base_url']
+                    return base_url(link['url'])
             except ValueError:
                 print('File contains invalid JSON: {}!'.format(link_json))
 
@@ -461,8 +461,8 @@ def fix_folder_path(archive_path, link_folder, link):
     target = os.path.join(archive_path, link['timestamp'])
 
     url_in_folder = parse_url(source)
-    if not (url_in_folder in link['base_url']
-            or link['base_url'] in url_in_folder):
+    if not (url_in_folder in base_url(link['url'])
+            or base_url(link['url']) in url_in_folder):
         raise ValueError('The link does not match the url for this folder.')
 
     if not os.path.exists(target):
@@ -550,12 +550,12 @@ def wget_output_path(link, look_in=None):
     urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
 
     if link['type'] in ('PDF', 'image'):
-        return urlencode(link['base_url'])
+        return urlencode(base_url(link['url']))
 
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
-    wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
+    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
     look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
 
     if look_in and os.path.exists(look_in):
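The new wget_folder line derives the same directory parts straight from the url; for example (values illustrative, base_url semantics as sketched above):

    base = base_url('https://example.com/blog/post')   # 'example.com/blog/post'
    wget_folder = base.rsplit('/', 1)[0].split('/')    # ['example.com', 'blog']
    # look_in then points at ARCHIVE_DIR/<timestamp>/example.com/blog/
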
@@ -575,7 +575,7 @@ def wget_output_path(link, look_in=None):
 
     # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
     #     # already ends in .html
-    #     return urlencode(link['base_url'])
+    #     return urlencode(base_url(link['url']))
     # else:
     #     # .html needs to be appended
     #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
@@ -588,7 +588,7 @@ def wget_output_path(link, look_in=None):
     #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
     #         elif '/' in without_scheme:
     #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(link['base_url'] + '/index.html')
+    #         return urlencode(base_url(link['url']) + '/index.html')
 
 
 def derived_link_info(link):
@@ -596,42 +596,45 @@ def derived_link_info(link):
 
     url = link['url']
 
-    link_info = {
+    extended_info = {
         **link,
-        'title': link['title'] or url,
+        'title': link['title'] or base_url(url),
         'date': datetime.fromtimestamp(Decimal(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
         'base_url': base_url(url),
         'domain': domain(url),
         'basename': basename(url),
         'path': path(url),
+    }
 
-        # Archive Method Output URLs
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'files_url': 'archive/{timestamp}/index.html'.format(**link),
+    # Archive Method Output URLs
+    extended_info = {
+        **extended_info,
+        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**extended_info),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
+        'files_url': 'archive/{timestamp}/index.html'.format(**extended_info),
         'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'),
-        'warc_url': 'archive/{timestamp}/warc'.format(**link),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'dom_link': 'archive/{timestamp}/output.html'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-        'git_url': 'archive/{timestamp}/git'.format(**link),
-        'media_url': 'archive/{timestamp}/media'.format(**link),
+        'warc_url': 'archive/{timestamp}/warc'.format(**extended_info),
+        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**extended_info),
+        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**extended_info),
+        'dom_link': 'archive/{timestamp}/output.html'.format(**extended_info),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
+        'git_url': 'archive/{timestamp}/git'.format(**extended_info),
+        'media_url': 'archive/{timestamp}/media'.format(**extended_info),
 
     }
 
     # PDF and images are handled slightly differently
     # wget, screenshot, & pdf urls all point to the same file
     if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
+        extended_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'dom_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
             'title': link['title'] or basename(link['url']),
         })
 
-    return link_info
+    return extended_info
 
 
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
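Note the two-pass construction in the new derived_link_info: the first dict adds 'base_url' and 'domain' before the second pass calls format(**extended_info), so the url templates no longer depend on those keys pre-existing on every link dict. A minimal illustration of the pattern (field values made up):

    info = {'timestamp': '1544447427', 'domain': 'example.com'}
    info = {
        **info,
        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**info),
    }
    # info['google_favicon_url'] == 'https://www.google.com/s2/favicons?domain=example.com'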