6 år sedan · a3705e31c6
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -33,6 +33,7 @@ from config import (
 
				     WGET_USER_AGENT,
			
 
				     CHECK_SSL_VALIDITY,
			
 
				     COOKIES_FILE,
			
 
				+    WGET_AUTO_COMPRESSION
			
 
				 )
			
 
				 from util import (
			
 
				     domain,
			
@@ -224,10 +225,10 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
 
				         '--backup-converted',
			
 
				         '--span-hosts',
			
 
				         '--no-parent',
			
 
				-        '--compression=auto',
			
 
				         '-e', 'robots=off',
			
 
				         '--restrict-file-names=unix',
			
 
				         '--timeout={}'.format(timeout),
			
 
				+        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
			
 
				         *(() if FETCH_WARC else ('--timestamping',)),
			
 
				         *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
			
 
				         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
			
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -74,6 +74,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
 
				 CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
			
 
				 USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
			
 
				 USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
			
 
				+WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode)  # truthy only if this wget exits 0 when given --compression=auto; all probe output is discarded
			
 
				 
			
 
				 ########################### Environment & Dependencies #########################
			
 
				 
			
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -154,7 +154,8 @@ def parse_rss_export(rss_file):
 
				     """Parse RSS XML-format files into links"""
			
 
				 
			
 
				     rss_file.seek(0)
			
 
				-    items = rss_file.read().split('</item>\n<item>')
			
 
				+    items = rss_file.read().split('<item>')
			
 
				+    items = items[1:] if items else []
			
 
				     for item in items:
			
 
				         # example item:
			
 
				         # <item>
			
@@ -166,7 +167,7 @@ def parse_rss_export(rss_file):
 
				         # </item>
			
 
				 
			
 
				         trailing_removed = item.split('</item>', 1)[0]
			
 
				-        leading_removed = trailing_removed.split('<item>', 1)[-1]
			
 
				+        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
			
 
				         rows = leading_removed.split('\n')
			
 
				 
			
 
				         def get_row(key):