Browse Source

bump timeouts and improve curl archive method

Nick Sweeting 7 years ago
parent
commit
7ea36c4adb
2 changed files with 18 additions and 8 deletions
  1. 17 7
      archivebox/archive_methods.py
  2. 1 1
      archivebox/util.py

+ 17 - 7
archivebox/archive_methods.py

@@ -214,6 +214,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
         '--span-hosts',
         '--no-parent',
         '--restrict-file-names=unix',
+        f'--timeout={timeout}',
         *(('--warc-file={}'.format(warc_path),) if warc else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
         *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
@@ -222,7 +223,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     ]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # index.html
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # index.html
         end()
         output = wget_output_path(link, look_in=domain_dir)
 
@@ -265,13 +266,13 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
         *chrome_headless(user_data_dir=user_data_dir),
         '--print-to-pdf',
         '--hide-scrollbars',
-        '--timeout=58000',
+        '--timeout={timeout * 1000}',
         *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
         link['url']
     ]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.pdf
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # output.pdf
         end()
         if result.returncode:
             print('     ', (result.stderr or result.stdout).decode())
@@ -304,14 +305,14 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
         '--screenshot',
         '--window-size={}'.format(resolution),
         '--hide-scrollbars',
-        '--timeout=58000',
+        '--timeout={timeout * 1000}',
         *(() if CHECK_SSL_VALIDITY else ('--disable-web-security', '--ignore-certificate-errors')),
         # '--full-page',   # TODO: make this actually work using ./bin/screenshot fullPage: true
         link['url'],
     ]
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # screenshot.png
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # screenshot.png
         end()
         if result.returncode:
             print('     ', (result.stderr or result.stdout).decode())
@@ -344,12 +345,13 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
     CMD = [
         *chrome_headless(user_data_dir=user_data_dir),
         '--dump-dom',
+        '--timeout={timeout * 1000}',
         link['url']
     ]
     end = progress(timeout, prefix='      ')
     try:
         with open(output_path, 'w+') as f:
-            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # output.html
+            result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 5)  # output.html
         end()
         if result.returncode:
             print('     ', (result.stderr).decode())
@@ -379,7 +381,15 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
 
     success = False
-    CMD = ['curl', '-L', '-I', '-X', 'GET', submit_url]
+    CMD = [
+        'curl',
+        '--location',
+        '--head',
+        '--max-time', str(timeout),
+        '--get',
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        submit_url,
+    ]
     end = progress(timeout, prefix='      ')
     try:
         result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # archive.org.txt

+ 1 - 1
archivebox/util.py

@@ -123,7 +123,7 @@ def progress(seconds=TIMEOUT, prefix=''):
     chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
     chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
 
-    def progress_bar(seconds=seconds, prefix=prefix):
+    def progress_bar(seconds, prefix):
         """show timer in the form of progress bar, with percentage and seconds remaining"""
         try:
             for s in range(seconds * chunks):