Browse Source

fix RSS parser bailing out when lines have whitespace before tags

Nick Sweeting 6 years ago
parent
commit
eff0100971
2 changed files with 2 additions and 3 deletions
  1. archivebox/archive.py (+1 −1)
  2. archivebox/parse.py (+1 −2)

+ 1 - 1
archivebox/archive.py

@@ -59,7 +59,6 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
     existing_links = []
     existing_links = []
     if archive_path:
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
         existing_links = parse_json_links_index(archive_path)
-        existing_links = validate_links(existing_links)
 
 
     new_links = []
     new_links = []
     if import_path:
     if import_path:
@@ -178,6 +177,7 @@ if __name__ == '__main__':
     elif stdin_raw_text:
     elif stdin_raw_text:
         source = save_source(stdin_raw_text)
         source = save_source(stdin_raw_text)
 
 
+
     # Step 1: Parse the links and dedupe them with existing archive
     # Step 1: Parse the links and dedupe them with existing archive
     all_links, new_links = load_links(archive_path=out_dir, import_path=source)
     all_links, new_links = load_links(archive_path=out_dir, import_path=source)
 
 

+ 1 - 2
archivebox/parse.py

@@ -161,7 +161,7 @@ def parse_rss_export(rss_file):
         rows = leading_removed.split('\n')
         rows = leading_removed.split('\n')
 
 
         def get_row(key):
         def get_row(key):
-            return [r for r in rows if r.startswith('<{}>'.format(key))][0]
+            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
 
 
         title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
         title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
         url = str_between(get_row('link'), '<link>', '</link>')
         url = str_between(get_row('link'), '<link>', '</link>')
@@ -209,7 +209,6 @@ def parse_shaarli_rss_export(rss_file):
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
 
 
-
         info = {
         info = {
             'url': url,
             'url': url,
             'domain': domain(url),
             'domain': domain(url),