Bläddra i källkod

Fix RSS parsing when items have newlines between them

Nick Sweeting 6 år sedan
förälder
incheckning
58c9b47d43
1 ändrade filer med 3 tillägg och 2 borttagningar
  1. 3 2
      archivebox/parse.py

+ 3 - 2
archivebox/parse.py

@@ -154,7 +154,8 @@ def parse_rss_export(rss_file):
     """Parse RSS XML-format files into links"""
     """Parse RSS XML-format files into links"""
 
 
     rss_file.seek(0)
     rss_file.seek(0)
-    items = rss_file.read().split('</item>\n<item>')
+    items = rss_file.read().split('<item>')
+    items = items[1:] if items else []
     for item in items:
     for item in items:
         # example item:
         # example item:
         # <item>
         # <item>
@@ -166,7 +167,7 @@ def parse_rss_export(rss_file):
         # </item>
         # </item>
 
 
         trailing_removed = item.split('</item>', 1)[0]
         trailing_removed = item.split('</item>', 1)[0]
-        leading_removed = trailing_removed.split('<item>', 1)[-1]
+        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
         rows = leading_removed.split('\n')
         rows = leading_removed.split('\n')
 
 
         def get_row(key):
         def get_row(key):