Browse Source

fix media download with longer timeout

Nick Sweeting 7 years ago
parent
commit
a15a331798
2 changed files with 18 additions and 12 deletions
  1. +16 -11
      archivebox/archive_methods.py
  2. +2 -1
      archivebox/config.py

+ 16 - 11
archivebox/archive_methods.py

@@ -29,6 +29,7 @@ from config import (
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,
     TIMEOUT,
+    MEDIA_TIMEOUT,
     ANSI,
     ARCHIVE_DIR,
     GIT_DOMAINS,
@@ -441,28 +442,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
     }
 
 @attach_result_to_link('media')
-def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False):
+def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
 
-    output = os.path.join(link_dir, 'media')
 
+    # import ipdb; ipdb.set_trace()
+    output = os.path.join(link_dir, 'media')
     if os.path.exists(output) and not overwrite:
         return {'output': 'media', 'status': 'skipped'}
 
-    os.mkdir(output)
-    print('    - Downloading media')
+    os.makedirs(output, exist_ok=True)
     CMD = [
         'youtube-dl',
         '--write-description',
         '--write-info-json',
         '--write-annotations',
         '--yes-playlist',
-        '--write-thumbnail ',
+        '--write-thumbnail',
         '--no-call-home',
         '--no-check-certificate',
-        '--user-agent ',
+        '--user-agent',
         '--all-subs',
         '-x',
+        '-k',
         '--audio-format', 'mp3',
         '--audio-quality', '320K',
         '--embed-thumbnail',
@@ -472,17 +474,20 @@ def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False):
 
     end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=output, timeout=timeout + 1)  # audio/audio.mp3
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1)  # audio/audio.mp3
         end()
         if result.returncode:
-            print('        got youtubedl response code {}:'.format(result.returncode))
-            raise Exception('Failed to download media')
-        chmod_file('media', cwd=link_dir)
-        return 'media'
+            if b'ERROR: Unsupported URL' in result.stderr:
+                # print('        none found')
+                pass
+            else:
+                print('        got youtubedl response code {}:'.format(result.returncode))
+                raise Exception('Failed to download media')
     except Exception as e:
         end()
         print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
         print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
 
     return {
         'cmd': CMD,

+ 2 - 1
archivebox/config.py

@@ -22,7 +22,7 @@ FETCH_PDF =              os.getenv('FETCH_PDF',              'True'
 FETCH_SCREENSHOT =       os.getenv('FETCH_SCREENSHOT',       'True'             ).lower() == 'true'
 FETCH_DOM =              os.getenv('FETCH_DOM',              'True'             ).lower() == 'true'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
-FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'True'             ).lower() == 'true'
+FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'False'            ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 RESOLUTION =             os.getenv('RESOLUTION',             '1440,1200'        )
@@ -33,6 +33,7 @@ WGET_BINARY =            os.getenv('WGET_BINARY',            'wget'
 WGET_USER_AGENT =        os.getenv('WGET_USER_AGENT',        'ArchiveBox')
 CHROME_USER_DATA_DIR =   os.getenv('CHROME_USER_DATA_DIR',    None)
 TIMEOUT =                int(os.getenv('TIMEOUT',            '60'))
+MEDIA_TIMEOUT =          int(os.getenv('MEDIA_TIMEOUT',      '3600'))
 FOOTER_INFO =            os.getenv('FOOTER_INFO',            'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.',)
 GIT_DOMAINS =            os.getenv('GIT_DOMAINS',            'github.com,bitbucket.org,gitlab.com').split(',')