Nick Sweeting 1 рік тому
батько
коміт
b6520243bc

+ 1 - 0
archivebox/plugins/gallerydl/__init__.py

@@ -0,0 +1 @@
+# NOTE(review): this file lives under archivebox/plugins/gallerydl/ but declares the
+# replaywebpage package path -- confirm which plugin this package is meant to be.
+__package__ = 'archivebox.plugins.replaywebpage'

+ 8 - 0
archivebox/plugins/gallerydl/apps.py

@@ -0,0 +1,8 @@
+from django.apps import AppConfig
+
+
+class ReplayWebPageConfig(AppConfig):
+    label = "ReplayWeb.Page"
+    name = "plugin_replaywebpage"
+    
+    default_auto_field = "django.db.models.BigAutoField"

+ 50 - 0
archivebox/plugins/gallerydl/extractors.py

@@ -0,0 +1,50 @@
+# browsertrix extractor
+
+def save_browsertrix(link, out_dir, timeout, config):
+
+
+	browsertrix_dir = out_dir / 'browsertrix'
+	browsertrix_dir.mkdir(exist_ok=True)
+
+	crawl_id = link.timestamp
+
+	browsertrix_crawler_cmd = [
+		'crawl',
+		f'--url', link.url,
+		f'--collection={crawl_id}',
+		'--scopeType=page',
+		'--generateWACZ',
+		'--text=final-to-warc',
+		'--timeLimit=60',
+	]
+
+	remote_cmd = """
+	rm /tmp/dump.rdb;
+	rm -rf /crawls/collections;
+	mkdir /crawls/collections;
+	env CRAWL_ID={crawl_id} 
+	"""
+
+	local_cmd = ['nc', 'browsertrix', '2222']
+
+	status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd, timeout=timeout)
+		
+		cmd_output = result.stdout.decode()
+
+		wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}'.wacz
+
+		copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
+
+
+
+# Placeholder template string; it currently contains only blank lines.
+TEMPLATE = """
+
+"""
+
+# rm /tmp/dump.rdb;
+# rm -rf /crawls/collections;
+# mkdir /crawls/collections;
+# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60

+ 182 - 0
archivebox/plugins/gallerydl/models.py

@@ -0,0 +1,182 @@
+from solo.models import SingletonModel
+
+
+class GalleryDLDependency(SingletonModel):
+    GALLERYDL_ENABLED = models.BooleanField(default=True)
+    GALLERYDL_BINARY = models.CharField(max_length=255, default='gallery-dl')
+
+    def __str__(self):
+        return "GalleryDL Dependency Configuration"
+
+    class Meta:
+        verbose_name = "GalleryDL Dependency Configuration"
+
+    @cached_property
+    def bin_path(self):
+        return bin_path(self.GALLERYDL_BINARY)
+
+    @cached_property
+    def bin_version(self):
+        return bin_version(self.bin_path)
+
+    @cached_property
+    def is_valid(self):
+        return self.bin_path and self.bin_version
+
+    @cached_property
+    def enabled(self):
+        return self.GALLERYDL_ENABLED and self.is_valid
+
+
+    def pretty_version(self):
+        if self.enabled:
+            if self.is_valid:
+                color, symbol, note, version = 'green', '√', 'valid', ''
+
+                parsed_version_num = re.search(r'[\d\.]+', self.bin_version)
+                if parsed_version_num:
+                    version = f'v{parsed_version_num[0]}'
+
+            if not self.bin_version:
+                color, symbol, note, version = 'red', 'X', 'invalid', '?'
+        else:
+            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
+
+        path = pretty_path(self.bin_path)
+
+        return ' '.join((
+            ANSI[color],
+            symbol,
+            ANSI['reset'],
+            name.ljust(21),
+            version.ljust(14),
+            ANSI[color],
+            note.ljust(8),
+            ANSI['reset'],
+            path.ljust(76),
+        ))
+
+
+
+class GalleryDLExtractor(SingletonModel):
+    GALLERYDL_EXTRACTOR_NAME = 'gallerydl'
+
+    SAVE_GALLERYDL = models.BooleanField(default=True)
+
+    GALLERYDL_DEPENDENCY = GalleryDLDependency.get_solo()
+
+    # https://github.com/mikf/gallery-dl
+    GALLERYDL_ARGS = models.CSVField(max_length=255, default=[])
+    GALLERYDL_TIMEOUT = models.IntegerField(default=lambda c: c['TIMEOUT'])
+    GALLERYDL_USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
+    GALLERYDL_COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')
+
+    ALIASES = {
+        'SAVE_GALLERYDL': ('USE_GALLERYDL', 'FETCH_GALLERYDL'),
+    }
+
+    @cached_property
+    def enabled(self):
+        return self.SAVE_GALLERYDL and self.GALLERYDL_DEPENDENCY.is_valid
+
+
+    def __str__(self):
+        return "GalleryDL Extractor Configuration"
+
+    class Meta:
+        verbose_name = "GalleryDL Extractor Configuration"
+
+    def __json__(self):
+        return {
+            'SAVE_GALLERYDL': self.SAVE_GALLERYDL,
+            'GALLERYDL_DEPENDENCY': self.GALLERYDL_DEPENDENCY.__json__(),
+            'GALLERYDL_ARGS': self.GALLERYDL_ARGS,
+            'GALLERYDL_TIMEOUT': self.GALLERYDL_TIMEOUT,
+            'GALLERYDL_USER_AGENT': self.GALLERYDL_USER_AGENT,
+            'GALLERYDL_COOKIES_TXT': self.GALLERYDL_COOKIES_TXT,
+        }
+
+    def validate(self):
+        assert 5 < self.GALLERYDL_TIMEOUT, 'GALLERYDL_TIMEOUT must be at least 5 seconds'
+        # assert Path(self.GALLERYDL_COOKIES_TXT).exists()
+        # TODO: validate user agent with uaparser
+        # TODO: validate args, cookies.txt?
+
+
+    def save(self, *args, **kwargs):
+        self.validate()
+        with transaction.atomic():
+            result = super().save(*args, **kwargs)
+            emit_event({'type': 'GalleryDLExtractor.save', 'diff': self.__json__(), 'kwargs': kwargs})
+            # potential consumers of this event:
+            #    - event logger: write to events.log
+            #    - config file updater: writes to ArchiveBox.conf
+            #    - supervisor: restarts relevant dependencies/extractors
+            #    - etc...
+
+        return result
+
+
+    def create_extractor_directory(self, parent_dir: Path):
+        return subdir = (parent_dir / self.GALLERYDL_EXTRACTOR_NAME).mkdir(exist_ok=True)
+
+    def should_extract(self, parent_dir: Path):
+        existing_files = (parent_dir / self.GALLERYDL_EXTRACTOR_NAME).glob('*')
+        return not existing_files
+
+
+    def extract(self, url: str, out_dir: Path):
+        if not self.enabled:
+            return
+
+        extractor_dir = self.create_extractor_directory(out_dir)
+
+        cmd = [
+            self.GALLERYDL_DEPENDENCY.bin_path,
+            url,
+            '--timeout', GALLERYDL_TIMEOUT,
+            '--cookies', GALLERYDL_COOKIES_TXT,
+            '--user-agent', GALLERYDL_USER_AGENT,
+            '--verify', config.CHECK_SSL_VALIDITY
+            *self.GALLERYDL_ARGS,
+        ]
+
+        status, stdout, stderr, output_path = 'failed', '', '', None
+        timer = TimedProgress(timeout, prefix='      ')
+        try:
+            proc = run(cmd, cwd=extractor_dir, timeout=self.GALLERYDL_TIMEOUT, text=True)
+            stdout, stderr = proc.stdout, proc.stderr
+            
+            if 'ERROR: Unsupported URL' in stderr:
+                hints = ('gallery-dl doesnt support this type of url yet',)
+                raise ArchiveError('Failed to save gallerydl', hints)
+
+            if proc.returncode == 0 and 'finished' in stdout:
+                output_path = extractor_dir / 'index.html'
+                status = 'succeeded'
+
+        except Exception as err:
+            stderr += err
+        finally:
+            timer.end()
+
+        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)
+
+        return ArchiveResult(
+            status=status,
+
+            cmd=cmd,
+            pwd=str(out_dir),
+            cmd_version=self.GALLERYDL_DEPENDENCY.bin_version,
+            cmd_path=self.GALLERYDL_DEPENDENCY.bin_path,
+            cmd_hostname=config.HOSTNAME,
+
+            output_path=output_path,
+            stdout=stdout,
+            stderr=stderr,
+
+            num_bytes=num_bytes,
+            num_files=num_files,
+            num_dirs=num_dirs,
+            **timer.stats,
+        )

Різницю між файлами не показано, бо вона завелика
+ 6 - 0
archivebox/plugins/gallerydl/static/sw.js


+ 1 - 0
archivebox/plugins/gallerydl/static/test.txt

@@ -0,0 +1 @@
+test content this should be visible

BIN
archivebox/plugins/gallerydl/static/test.wacz


Різницю між файлами не показано, бо вона завелика
+ 6 - 0
archivebox/plugins/gallerydl/static/ui.js


+ 40 - 0
archivebox/plugins/gallerydl/templates/plugin_replaywebpage__viewer.html

@@ -0,0 +1,40 @@
+{% load tz core_tags static %}
+{# Debug viewer page: embeds a <replay-web-page> element pointed at a static test WACZ file. #}
+
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <title>{{title}}</title>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1">
+
+        <style>
+            html, body {
+                width: 100%;
+                height: 100%;
+                background-color: #ddd;
+            }
+        </style>
+    </head>
+    <body>
+        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}
+
+        {{snapshot}}
+
+        <script>
+            // https://cdn.jsdelivr.net/npm/[email protected]/sw.min.js
+            // https://cdn.jsdelivr.net/npm/[email protected]/ui.min.js
+        </script>
+
+        <script src="/static/ui.js"></script>
+
+        <replay-web-page
+            style="height: 600px"
+            embed="replay"
+            replayBase="/static/"
+            source="/static/test.wacz"
+            url="https://example.com/">
+        </replay-web-page>
+    </body>
+</html>

+ 12 - 0
archivebox/plugins/gallerydl/urls.py

@@ -0,0 +1,12 @@
+from django.urls import path
+
+from .views import GalleryDLIconView, GalleryDLEmbedView, GalleryDLOutputView, GalleryDLDependencyView, GalleryDLExtractorView
+
+urlpatterns = [
+	path('/plugins/gallerydl/icon/<path:path>', GalleryDLIconView(.as_view), name='gallerydl_icon'),
+	path('/plugins/gallerydl/embed/<path:path>', GalleryDLEmbedView.as_view(), name='gallerydl_embed'),
+	path('/plugins/gallerydl/output/<path:path>', GalleryDLOutputView.as_view(), name='gallerydl_output'),
+
+	path('/plugins/gallerydl/dependency/', GalleryDLDependencyView.as_view(), name='gallerydl_dependency'),
+	path('/plugins/gallerydl/extractor/', GalleryDLExtractorView.as_view(), name='gallerydl_extractor'),
+]

+ 78 - 0
archivebox/plugins/gallerydl/views.py

@@ -0,0 +1,78 @@
+import os
+import sys
+from pathlib import Path
+
+from django.views import View
+from django.shortcuts import redirect, render
+from django.db.models import Q
+
+from core.models import Snapshot
+
+# from archivebox.config import PUBLIC_SNAPSHOTS
+# Hard-coded stand-in for the commented-out config import above.  While this is
+# True, the views in this module never redirect anonymous users to the login page.
+PUBLIC_SNAPSHOTS = True
+
+
+class GalleryDLIconView(View):
+    template_name = 'plugin_gallerydl__icon.html'
+
+    # render static html index from filesystem archive/<timestamp>/index.html
+
+    def get_context_data(self, **kwargs):
+        return {
+            # **super().get_context_data(**kwargs),
+            # 'VERSION': VERSION,
+            # 'COMMIT_HASH': COMMIT_HASH,
+            # 'FOOTER_INFO': FOOTER_INFO,
+        }
+
+
+    def get(self, request, path):
+        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
+            return redirect(f'/admin/login/?next={request.path}')
+
+        # ...
+        return render(template_name=self.template_name, request=self.request, context=context)
+
+
+class GalleryDLEmbedView(View):
+    template_name = 'plugin_gallerydl__embed.html'
+
+    # render static html index from filesystem archive/<timestamp>/index.html
+
+    def get_context_data(self, **kwargs):
+        return {
+            # **super().get_context_data(**kwargs),
+            # 'VERSION': VERSION,
+            # 'COMMIT_HASH': COMMIT_HASH,
+            # 'FOOTER_INFO': FOOTER_INFO,
+        }
+
+
+    def get(self, request, path):
+        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
+            return redirect(f'/admin/login/?next={request.path}')
+
+        # ...
+        return render(template_name=self.template_name, request=self.request, context=context)
+
+
+class GalleryDLOutputView(View):
+    template_name = 'plugin_gallerydl__output.html'
+
+    # render static html index from filesystem archive/<timestamp>/index.html
+
+    def get_context_data(self, **kwargs):
+        return {
+            # **super().get_context_data(**kwargs),
+            # 'VERSION': VERSION,
+            # 'COMMIT_HASH': COMMIT_HASH,
+            # 'FOOTER_INFO': FOOTER_INFO,
+        }
+
+
+    def get(self, request, path):
+        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
+            return redirect(f'/admin/login/?next={request.path}')
+
+        # ...
+        return render(template_name=self.template_name, request=self.request, context=context)

Деякі файли не було показано, через те що забагато файлів було змінено