소스 검색

add ability to fetch media

Nick Sweeting 7 년 전
부모
커밋
c33f7ba91c
2개의 변경된 파일, 56개의 추가 및 67개의 삭제
  1. 55 67
      archivebox/archive_methods.py
  2. 1 0
      archivebox/config.py

+ 55 - 67
archivebox/archive_methods.py

@@ -18,6 +18,7 @@ from config import (
     FETCH_SCREENSHOT,
     FETCH_SCREENSHOT,
     FETCH_DOM,
     FETCH_DOM,
     FETCH_GIT,
     FETCH_GIT,
+    FETCH_MEDIA,
     RESOLUTION,
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
     SUBMIT_ARCHIVE_DOT_ORG,
@@ -89,6 +90,9 @@ def archive_link(link_dir, link, overwrite=True):
     
     
     log_link_archive(link_dir, link, update_existing)
     log_link_archive(link_dir, link, update_existing)
 
 
+    if FETCH_FAVICON:
+        link = fetch_favicon(link_dir, link, overwrite=overwrite)
+
     if FETCH_WGET:
     if FETCH_WGET:
         link = fetch_wget(link_dir, link, overwrite=overwrite)
         link = fetch_wget(link_dir, link, overwrite=overwrite)
 
 
@@ -113,8 +117,9 @@ def archive_link(link_dir, link, overwrite=True):
     if FETCH_GIT:
     if FETCH_GIT:
         link = fetch_git(link_dir, link, overwrite=overwrite)
         link = fetch_git(link_dir, link, overwrite=overwrite)
 
 
-    if FETCH_FAVICON:
-        link = fetch_favicon(link_dir, link, overwrite=overwrite)
+    if FETCH_MEDIA:
+        link = fetch_media(link_dir, link, overwrite=overwrite)
+
 
 
     write_link_index(link_dir, link)
     write_link_index(link_dir, link)
     # print()
     # print()
@@ -435,71 +440,54 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
         'output': output,
     }
     }
 
 
-# @attach_result_to_link('audio')
-# def fetch_audio(link_dir, link, timeout=TIMEOUT):
-#     """Download audio rip using youtube-dl"""
-
-#     if link['type'] not in ('soundcloud',)\
-#        and 'audio' not in link['tags']:
-#         return
-
-#     path = os.path.join(link_dir, 'audio')
-
-#     if not os.path.exists(path) or overwrite:
-#         print('    - Downloading audio')
-#         CMD = [
-#             "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
-#             link['url'],
-#         ]
-#         end = progress(timeout, prefix='      ')
-#         try:
-#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # audio/audio.mp3
-#             end()
-#             if result.returncode:
-#                 print('     ', result.stderr.decode())
-#                 raise Exception('Failed to download audio')
-#             chmod_file('audio.mp3', cwd=link_dir)
-#             return 'audio.mp3'
-#         except Exception as e:
-#             end()
-#             print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
-#             print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-#             raise
-#     else:
-#         print('    √ Skipping audio download')
-
-# @attach_result_to_link('video')
-# def fetch_video(link_dir, link, timeout=TIMEOUT):
-#     """Download video rip using youtube-dl"""
-
-#     if link['type'] not in ('youtube', 'youku', 'vimeo')\
-#        and 'video' not in link['tags']:
-#         return
-
-#     path = os.path.join(link_dir, 'video')
-
-#     if not os.path.exists(path) or overwrite:
-#         print('    - Downloading video')
-#         CMD = [
-#             "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
-#             link['url'],
-#         ]
-#         end = progress(timeout, prefix='      ')
-#         try:
-#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # video/movie.mp4
-#             end()
-#             if result.returncode:
-#                 print('     ', result.stderr.decode())
-#                 raise Exception('Failed to download video')
-#             chmod_file('video.mp4', cwd=link_dir)
-#             return 'video.mp4'
-#         except Exception as e:
-#             end()
-#             print('        Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
-#             print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-#             raise
-#     else:
-#         print('    √ Skipping video download')
+@attach_result_to_link('media')
+def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False):
+    """Download playlists or individual video, audio, and subtitles using youtube-dl.
+
+    youtube-dl is run inside a ``media`` subdirectory of ``link_dir``.
+
+    Args:
+        link_dir: directory that holds the archive output for this link.
+        link: link dict; only ``link['url']`` is read here.
+        timeout: seconds allowed for the download (the subprocess is given
+            one extra second before it is cut off).
+        overwrite: when False, an existing ``media`` directory is left alone.
+
+    Returns:
+        ``{'output': 'media', 'status': 'skipped'}`` when the directory
+        already exists and ``overwrite`` is False. Otherwise a dict with the
+        command that was run under ``'cmd'`` and, under ``'output'``, either
+        ``'media'`` on success or the absolute media path on failure.
+    """
+
+    output = os.path.join(link_dir, 'media')
+
+    if os.path.exists(output) and not overwrite:
+        return {'output': 'media', 'status': 'skipped'}
+
+    # makedirs(exist_ok=True) instead of mkdir: with overwrite=True the
+    # directory may already exist, and mkdir would raise FileExistsError.
+    os.makedirs(output, exist_ok=True)
+    print('    - Downloading media')
+    # Every argv element must be the exact option text: a trailing space
+    # makes it an unknown option. The previous value-less '--user-agent '
+    # entry was dropped because it had no user-agent string to pass.
+    CMD = [
+        'youtube-dl',
+        '--write-description',
+        '--write-info-json',
+        '--write-annotations',
+        '--yes-playlist',
+        '--write-thumbnail',
+        '--no-call-home',
+        '--no-check-certificate',
+        '--all-subs',
+        '-x',
+        '--audio-format', 'mp3',
+        '--audio-quality', '320K',
+        '--embed-thumbnail',
+        '--add-metadata',
+        link['url'],
+    ]
+
+    end = progress(timeout, prefix='      ')
+    try:
+        result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=output, timeout=timeout + 1)
+        end()
+        if result.returncode:
+            print('        got youtubedl response code {}:'.format(result.returncode))
+            raise Exception('Failed to download media')
+        chmod_file('media', cwd=link_dir)
+        # Report success through the same dict shape as the other return
+        # paths instead of returning a bare string.
+        output = 'media'
+    except Exception as e:
+        end()
+        # The command ran with cwd=output, so the reproduction hint must cd there.
+        print('        Run to see full output:', 'cd {}; {}'.format(output, ' '.join(CMD)))
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        # NOTE(review): on failure 'output' is still the media path, as before;
+        # confirm whether the attach_result_to_link decorator expects the
+        # exception object here to mark the run as failed.
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
 
 
 @attach_result_to_link('git')
 @attach_result_to_link('git')
 def fetch_git(link_dir, link, timeout=TIMEOUT):
 def fetch_git(link_dir, link, timeout=TIMEOUT):

+ 1 - 0
archivebox/config.py

@@ -22,6 +22,7 @@ FETCH_PDF =              os.getenv('FETCH_PDF',              'True'
 FETCH_SCREENSHOT =       os.getenv('FETCH_SCREENSHOT',       'True'             ).lower() == 'true'
 FETCH_SCREENSHOT =       os.getenv('FETCH_SCREENSHOT',       'True'             ).lower() == 'true'
 FETCH_DOM =              os.getenv('FETCH_DOM',              'True'             ).lower() == 'true'
 FETCH_DOM =              os.getenv('FETCH_DOM',              'True'             ).lower() == 'true'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
+FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'True'             ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 RESOLUTION =             os.getenv('RESOLUTION',             '1440,1200'        )
 RESOLUTION =             os.getenv('RESOLUTION',             '1440,1200'        )