Browse Source

Don't be strict on unicode errors

Joseph Turian 3 years ago
parent
commit
b864c38d9e
1 changed file with 10 additions and 1 deletion
  1. +10 -1
      archivebox/extractors/media.py

+ 10 - 1
archivebox/extractors/media.py

@@ -74,7 +74,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     # add video description and subtitles to full-text index
     # add video description and subtitles to full-text index
     # Let's try a few different 
     # Let's try a few different 
     index_texts = [
     index_texts = [
-        text_file.read_text(encoding='utf-8').strip()
+        # errors:
+        # * 'strict' to raise a ValueError exception if there is an
+        #   encoding error. The default value of None has the same effect.
+        # * 'ignore' ignores errors. Note that ignoring encoding errors
+        #   can lead to data loss.
+        # * 'xmlcharrefreplace' is only supported when writing to a
+        #   file. Characters not supported by the encoding are replaced with
+        #   the appropriate XML character reference &#nnn;.
+        # There are a few more options described in https://docs.python.org/3/library/functions.html#open
+        text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
         for text_file in (
         for text_file in (
             *output_path.glob('*.description'),
             *output_path.glob('*.description'),
             *output_path.glob('*.srt'),
             *output_path.glob('*.srt'),