misha 5 år sedan
förälder
incheckning
18f0f66f1e
3 ändrade filer med 8 tillägg och 3 borttagningar
  1. 3 2
      archivebox/archive_methods.py
  2. 4 1
      archivebox/config.py
  3. 1 0
      etc/ArchiveBox.conf.default

+ 3 - 2
archivebox/archive_methods.py

@@ -30,6 +30,7 @@ from config import (
     OUTPUT_DIR,
     GIT_DOMAINS,
     GIT_SHA,
+    CURL_USER_AGENT,
     WGET_USER_AGENT,
     CHECK_SSL_VALIDITY,
     COOKIES_FILE,
@@ -226,7 +227,7 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
         '--span-hosts',
         '--no-parent',
         '-e', 'robots=off',
-        '--restrict-file-names=windows',
+        '--restrict-file-names=nocontrol',
         '--timeout={}'.format(timeout),
         *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
         *(() if FETCH_WARC else ('--timestamping',)),
@@ -561,7 +562,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         CURL_BINARY,
         '--location',
         '--head',
-        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
+	    *(('--user-agent', '{}'.format(CURL_USER_AGENT),) if CURL_USER_AGENT else ()),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
         '--max-time', str(timeout),
         *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
         submit_url,

+ 4 - 1
archivebox/config.py

@@ -35,6 +35,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'
 CHECK_SSL_VALIDITY =     os.getenv('CHECK_SSL_VALIDITY',     'True'             ).lower() == 'true'
 RESOLUTION =             os.getenv('RESOLUTION',             '1440,2000'        )
 GIT_DOMAINS =            os.getenv('GIT_DOMAINS',            'github.com,bitbucket.org,gitlab.com').split(',')
+CURL_USER_AGENT =        os.getenv('CURL_USER_AGENT',        'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)')
 WGET_USER_AGENT =        os.getenv('WGET_USER_AGENT',        'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
 COOKIES_FILE =           os.getenv('COOKIES_FILE',           None)
 CHROME_USER_DATA_DIR =   os.getenv('CHROME_USER_DATA_DIR',   None)
@@ -192,13 +193,15 @@ try:
         raise
 
     ### Make sure curl is installed
-    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+    if FETCH_FAVICON or FETCH_TITLE or SUBMIT_ARCHIVE_DOT_ORG:
         if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
             print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
             print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
             print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
             raise SystemExit(1)
 
+        CURL_USER_AGENT = CURL_USER_AGENT.format(GIT_SHA=GIT_SHA[:9])
+
     ### Make sure wget is installed and calculate version
     if FETCH_WGET or FETCH_WARC:
         if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:

+ 1 - 0
etc/ArchiveBox.conf.default

@@ -40,6 +40,7 @@
 #CHECK_SSL_VALIDITY=True
 #FETCH_WGET_REQUISITES=True
 #RESOLUTION="1440,900"
+#CURL_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
 #WGET_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
 #CHROME_USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36"
 #GIT_DOMAINS="github.com,bitbucket.org,gitlab.com"