فهرست منبع

add HTTP byte range request support to media file serving

Nick Sweeting 1 سال پیش
والد
کامیت
c76c50e71f
4فایلهای تغییر یافته به همراه181 افزوده شده و 8 حذف شده
  1. 1 1
      archivebox/cli/__init__.py
  2. 169 0
      archivebox/core/serve_static.py
  3. 7 6
      archivebox/core/urls.py
  4. 4 1
      archivebox/core/views.py

+ 1 - 1
archivebox/cli/__init__.py

@@ -68,7 +68,7 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
         else:
         else:
             return tries
             return tries
 
 
-    raise Exception('Background threads failed to exit after {tries}s: {threads_summary}')
+    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
 
 
 
 
 def list_subcommands() -> Dict[str, str]:
 def list_subcommands() -> Dict[str, str]:

+ 169 - 0
archivebox/core/serve_static.py

@@ -0,0 +1,169 @@
+import os
+import stat
+import posixpath
+import mimetypes
+from pathlib import Path
+
+from django.contrib.staticfiles import finders
+from django.views import static
+from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified
+from django.utils._os import safe_join
+from django.utils.http import http_date
+from django.utils.translation import gettext as _
+
+
+def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
+    """
+    Overrides Django's built-in django.views.static.serve function to support byte range requests.
+    This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
+    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
+
+    Parameters:
+        request: the incoming HttpRequest; only request.META is read
+            (HTTP_IF_MODIFIED_SINCE and HTTP_RANGE).
+        path: path of the requested file, relative to document_root.
+        document_root: directory the file is served from (required).
+        show_indexes: if True, directories get Django's directory index page,
+            otherwise they raise Http404.
+
+    Returns:
+        - HttpResponseNotModified (304) if the file is unchanged since If-Modified-Since
+        - HttpResponse with status 416 if a single range starts at or past the end of the file
+        - StreamingHttpResponse with status 206 for a satisfiable single byte range
+        - StreamingHttpResponse with status 200 otherwise (no Range header, an
+          invalid one, or a multi-range request, which is served as the whole file)
+
+    Raises:
+        Http404: if the path does not exist, or is a directory and show_indexes is False.
+    """
+    assert document_root
+    path = posixpath.normpath(path).lstrip("/")
+    fullpath = Path(safe_join(document_root, path))
+    if fullpath.is_dir():
+        if show_indexes:
+            return static.directory_index(path, fullpath)
+        raise Http404(_("Directory indexes are not allowed here."))
+    if not fullpath.exists():
+        raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})
+
+    # Respect the If-Modified-Since header.
+    statobj = fullpath.stat()
+    if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
+        return HttpResponseNotModified()
+
+    content_type, encoding = mimetypes.guess_type(str(fullpath))
+    content_type = content_type or "application/octet-stream"
+
+    # Byte ranges are only meaningful for regular files with a known size.
+    is_regular_file = stat.S_ISREG(statobj.st_mode)
+    size = statobj.st_size
+
+    # Respect the Range header. The range is validated BEFORE the file is
+    # opened so that the 416 early return cannot leak an open file handle.
+    byte_range = None
+    if is_regular_file and "HTTP_RANGE" in request.META:
+        try:
+            ranges = parse_range_header(request.META['HTTP_RANGE'], size)
+        except ValueError:
+            ranges = None
+        # only handle syntactically valid headers, that are simple (no
+        # multipart byteranges); anything else is served as the whole file
+        if ranges is not None and len(ranges) == 1:
+            start, stop = ranges[0]
+            if start >= size:
+                # requested range not satisfiable: it begins at or past EOF
+                unsatisfiable = HttpResponse(status=416)
+                unsatisfiable["Content-Range"] = "bytes */%d" % size
+                return unsatisfiable
+            # a last-byte-pos beyond EOF is legal and means "up to the end of
+            # the file", so clamp it instead of rejecting the request
+            byte_range = (start, min(stop, size))
+
+    # setup response object
+    ranged_file = RangedFileReader(open(fullpath, "rb"))
+    response = StreamingHttpResponse(ranged_file, content_type=content_type)
+    response.headers["Last-Modified"] = http_date(statobj.st_mtime)
+
+    if is_regular_file:
+        response["Content-Length"] = size
+        response["Accept-Ranges"] = "bytes"
+        response["X-Django-Ranges-Supported"] = "1"
+        # handle byte-range requests by serving only the requested chunk of the file
+        if byte_range is not None:
+            start, stop = byte_range
+            ranged_file.start = start
+            ranged_file.stop = stop
+            # Content-Range uses an inclusive last byte, hence stop - 1
+            response["Content-Range"] = "bytes %d-%d/%d" % (start, stop - 1, size)
+            response["Content-Length"] = stop - start
+            response.status_code = 206
+    if encoding:
+        response.headers["Content-Encoding"] = encoding
+    return response
+
+
+def serve_static(request, path, **kwargs):
+    """
+    Serve a static file located through the staticfiles finders, with
+    support for HTTP byte range requests.
+
+    The requested path is normalized and looked up with
+    django.contrib.staticfiles.finders.find(). When a file is found, the
+    response is produced by serve_static_with_byterange_support() using the
+    file's own directory as document_root; any extra keyword arguments are
+    passed through to it unchanged.
+
+    To use, put a URL pattern such as::
+
+        re_path(r"^static/(?P<path>.*)$", serve_static)
+
+    in your URLconf.
+
+    Raises Http404 if no finder can locate the requested path.
+    """
+
+    lookup_path = posixpath.normpath(path).lstrip("/")
+    found = finders.find(lookup_path)
+    if found:
+        # serve the match relative to the directory that contains it
+        document_root, path = os.path.split(found)
+        return serve_static_with_byterange_support(request, path, document_root=document_root, **kwargs)
+
+    # nothing found: tell directory-style requests apart from missing files
+    if path == "" or path.endswith("/"):
+        raise Http404("Directory indexes are not allowed here.")
+    raise Http404("'%s' could not be found" % path)
+
+
+def parse_range_header(header, resource_size):
+    """
+    Parses a range header into a list of two-tuples (start, stop) where `start`
+    is the starting byte of the range (inclusive) and `stop` is the ending byte
+    position of the range (exclusive), so `stop - start` is the byte count.
+
+    Open-ended ranges ("500-") and suffix ranges ("-500") are resolved against
+    `resource_size`. For an explicit "first-last" range `stop` may be larger
+    than `resource_size`; it is up to the caller to clamp or reject it.
+
+    Returns None if the header is empty, uses a unit other than "bytes", or is
+    not syntactically valid. Malformed numbers yield None rather than raising.
+    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
+    """
+    if not header or "=" not in header:
+        return None
+
+    units, range_ = header.split("=", 1)
+    if units.strip().lower() != "bytes":
+        return None
+
+    def is_position(text):
+        # Byte positions are plain ASCII digits. int() on its own is too
+        # lenient (it accepts "+5", "1_0", ...) and raises ValueError on junk,
+        # so validate explicitly before converting.
+        return text.isascii() and text.isdigit()
+
+    ranges = []
+    for val in range_.split(","):
+        first, dash, last = val.strip().partition("-")
+        if not dash:
+            return None
+        first, last = first.strip(), last.strip()
+
+        if not first:
+            # suffix-byte-range-spec: this form specifies the last N bytes of an
+            # entity-body
+            if not is_position(last):
+                return None
+            # a suffix longer than the resource means "the whole resource"
+            start = max(resource_size - int(last), 0)
+            stop = resource_size
+        else:
+            # byte-range-spec: first-byte-pos "-" [last-byte-pos]
+            if not is_position(first) or (last and not is_position(last)):
+                return None
+            start = int(first)
+            # the +1 is here since we want the stopping point to be exclusive, whereas in
+            # the HTTP spec, the last-byte-pos is inclusive
+            stop = int(last) + 1 if last else resource_size
+            if start >= stop:
+                return None
+
+        ranges.append((start, stop))
+
+    return ranges
+
+
+class RangedFileReader:
+    """
+    Wraps a file like object with an iterator that runs over part (or all) of
+    the file defined by start and stop. Blocks of at most block_size bytes are
+    returned from the starting position, up to, but not including the stop point.
+
+    `start` and `stop` are plain attributes and may be reassigned after
+    construction; they are read when iteration actually begins.
+
+    The reader exposes close(), which closes the wrapped file, so that a
+    consumer which calls close() on its content iterable releases the file.
+    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
+    """
+
+    # default number of bytes read (and yielded) per chunk
+    block_size = 8192
+
+    def __init__(self, file_like, start=0, stop=float("inf"), block_size=None):
+        self.f = file_like
+        self.block_size = block_size or RangedFileReader.block_size
+        self.start = start
+        self.stop = stop
+
+    def __iter__(self):
+        self.f.seek(self.start)
+        position = self.start
+        while position < self.stop:
+            data = self.f.read(min(self.block_size, self.stop - position))
+            if not data:
+                break
+
+            yield data
+            # advance by what was actually read: read() may return fewer bytes
+            # than requested, and assuming a full block would skip data
+            position += len(data)
+
+    def close(self):
+        """Close the wrapped file object, if it supports being closed."""
+        close = getattr(self.f, "close", None)
+        if close is not None:
+            close()

+ 7 - 6
archivebox/core/urls.py

@@ -1,14 +1,13 @@
 __package__ = 'archivebox.core'
 __package__ = 'archivebox.core'
 
 
-from django.urls import path, include
+from django.urls import path, re_path, include
 from django.views import static
 from django.views import static
-from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.conf import settings
 from django.views.generic.base import RedirectView
 from django.views.generic.base import RedirectView
 
 
 from .admin import archivebox_admin
 from .admin import archivebox_admin
 from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
 from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
-
+from .serve_static import serve_static
 
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
 # from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@@ -18,13 +17,16 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 # print('DEBUG', settings.DEBUG)
 # print('DEBUG', settings.DEBUG)
 
 
 urlpatterns = [
 urlpatterns = [
-    path('public/', PublicIndexView.as_view(), name='public-index'),
+    re_path(r"^static/(?P<path>.*)$", serve_static),
+    # re_path(r"^media/(?P<path>.*)$", static.serve, {"document_root": settings.MEDIA_ROOT}),
 
 
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
     path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
 
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
 
 
+    path('public/', PublicIndexView.as_view(), name='public-index'),
+    
     path('archive/', RedirectView.as_view(url='/')),
     path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
     path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
 
 
@@ -41,7 +43,7 @@ urlpatterns = [
     path("api/",      include('api.urls'), name='api'),
     path("api/",      include('api.urls'), name='api'),
 
 
     path('health/', HealthCheckView.as_view(), name='healthcheck'),
     path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda *_: 1/0),
+    path('error/', lambda *_: 1/0),                                             # type: ignore
 
 
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
     # path('jet_api/', include('jet_django.urls')),  Enable to use https://www.jetadmin.io/integrations/django
 
 
@@ -49,7 +51,6 @@ urlpatterns = [
     path('index.json', static.serve, {'document_root': settings.CONFIG.OUTPUT_DIR, 'path': 'index.json'}),
     path('index.json', static.serve, {'document_root': settings.CONFIG.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
     path('', HomepageView.as_view(), name='Home'),
 ]
 ]
-urlpatterns += staticfiles_urlpatterns()
 
 
 if settings.DEBUG_TOOLBAR:
 if settings.DEBUG_TOOLBAR:
     urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
     urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]

+ 4 - 1
archivebox/core/views.py

@@ -46,6 +46,7 @@ from ..main import add
 from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
 from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
 from ..search import query_search_index
 from ..search import query_search_index
 from ..extractors.wget import wget_output_path
 from ..extractors.wget import wget_output_path
+from .serve_static import serve_static_with_byterange_support
 
 
 
 
 class HomepageView(View):
 class HomepageView(View):
@@ -197,7 +198,9 @@ class SnapshotView(View):
                         # if they requested snapshot index, serve live rendered template instead of static html
                         # if they requested snapshot index, serve live rendered template instead of static html
                         response = self.render_live_index(request, snapshot)
                         response = self.render_live_index(request, snapshot)
                     else:
                     else:
-                        response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                        response = serve_static_with_byterange_support(
+                            request, archivefile, document_root=snapshot.link_dir, show_indexes=True,
+                        )
                     response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                     response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                     return response
                     return response
                 except Snapshot.DoesNotExist:
                 except Snapshot.DoesNotExist: