media.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. __package__ = 'archivebox.extractors'
  2. from pathlib import Path
  3. from typing import Optional
  4. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  5. from ..system import run, chmod_file
  6. from ..util import (
  7. enforce_types,
  8. is_static_file,
  9. )
  10. from ..config import (
  11. MEDIA_TIMEOUT,
  12. SAVE_MEDIA,
  13. YOUTUBEDL_ARGS,
  14. YOUTUBEDL_BINARY,
  15. YOUTUBEDL_VERSION,
  16. CHECK_SSL_VALIDITY
  17. )
  18. from ..logging_util import TimedProgress
  19. @enforce_types
  20. def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
  21. if is_static_file(link.url):
  22. return False
  23. out_dir = out_dir or Path(link.link_dir)
  24. if not overwrite and (out_dir / 'media').exists():
  25. return False
  26. return SAVE_MEDIA
  27. @enforce_types
  28. def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
  29. """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
  30. out_dir = out_dir or Path(link.link_dir)
  31. output: ArchiveOutput = 'media'
  32. output_path = out_dir / output
  33. output_path.mkdir(exist_ok=True)
  34. cmd = [
  35. YOUTUBEDL_BINARY,
  36. *YOUTUBEDL_ARGS,
  37. *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
  38. # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
  39. link.url,
  40. ]
  41. status = 'succeeded'
  42. timer = TimedProgress(timeout, prefix=' ')
  43. try:
  44. result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
  45. chmod_file(output, cwd=str(out_dir))
  46. if result.returncode:
  47. if (b'ERROR: Unsupported URL' in result.stderr
  48. or b'HTTP Error 404' in result.stderr
  49. or b'HTTP Error 403' in result.stderr
  50. or b'URL could be a direct video link' in result.stderr
  51. or b'Unable to extract container ID' in result.stderr):
  52. # These happen too frequently on non-media pages to warrant printing to console
  53. pass
  54. else:
  55. hints = (
  56. 'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
  57. *result.stderr.decode().split('\n'),
  58. )
  59. raise ArchiveError('Failed to save media', hints)
  60. except Exception as err:
  61. status = 'failed'
  62. output = err
  63. finally:
  64. timer.end()
  65. # add video description and subtitles to full-text index
  66. # Let's try a few different
  67. index_texts = [
  68. # errors:
  69. # * 'strict' to raise a ValueError exception if there is an
  70. # encoding error. The default value of None has the same effect.
  71. # * 'ignore' ignores errors. Note that ignoring encoding errors
  72. # can lead to data loss.
  73. # * 'xmlcharrefreplace' is only supported when writing to a
  74. # file. Characters not supported by the encoding are replaced with
  75. # the appropriate XML character reference &#nnn;.
  76. # There are a few more options described in https://docs.python.org/3/library/functions.html#open
  77. text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
  78. for text_file in (
  79. *output_path.glob('*.description'),
  80. *output_path.glob('*.srt'),
  81. *output_path.glob('*.vtt'),
  82. *output_path.glob('*.lrc'),
  83. *output_path.glob('*.lrc'),
  84. )
  85. ]
  86. return ArchiveResult(
  87. cmd=cmd,
  88. pwd=str(out_dir),
  89. cmd_version=YOUTUBEDL_VERSION,
  90. output=output,
  91. status=status,
  92. index_texts=index_texts,
  93. **timer.stats,
  94. )