Browse Source

Save the URL as the title for static files or non-HTML files

Nick Sweeting 4 years ago
parent
commit
385daf9af8
1 changed file with 5 additions and 4 deletions
  1. 5 4
      archivebox/extractors/title.py

+ 5 - 4
archivebox/extractors/title.py

@@ -62,9 +62,6 @@ class TitleParser(HTMLParser):
 
 
 @enforce_types
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    if is_static_file(link.url):
-        return False
-
     # if link already has valid title, skip it
     # if link already has valid title, skip it
     if not overwrite and link.title and not link.title.lower().startswith('http'):
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False
         return False
@@ -113,7 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
                                         timestamp=link.timestamp)\
                                         timestamp=link.timestamp)\
                                 .update(title=output)
                                 .update(title=output)
         else:
         else:
-            raise ArchiveError('Unable to detect page title')
+            # if no content was returned, don't save a title (because it might be a temporary error)
+            if not html:
+                raise ArchiveError('Unable to detect page title')
+            # output = html[:128]       # use first bit of content as the title
+            output = link.base_url      # use the filename as the title (better UX)
     except Exception as err:
     except Exception as err:
         status = 'failed'
         status = 'failed'
         output = err
         output = err