Nick Sweeting hai 5 anos
pai
achega
ac9e0e356d

+ 3 - 4
archivebox/config.py

@@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('')
         stderr('')
 
 
     if config['TIMEOUT'] < 5:
     if config['TIMEOUT'] < 5:
-        stderr()
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
         stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
         stderr()
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr()
 
 
     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
-        stderr()
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
         stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
         stderr()
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr()
 
 
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
-        stderr()
         stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
         stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
         stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
         stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr()
         stderr()
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
-
+        stderr()
         
         
 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
     output_dir = out_dir or config['OUTPUT_DIR']
     output_dir = out_dir or config['OUTPUT_DIR']

+ 11 - 3
archivebox/core/admin.py

@@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django.contrib.auth import get_user_model
 from django import forms
 from django import forms
 
 
-from core.models import Snapshot
+from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
 from core.forms import AddLinkForm, TagField
 from core.utils import get_icons
 from core.utils import get_icons
 
 
@@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
     def title_str(self, obj):
     def title_str(self, obj):
         canon = obj.as_link().canonical_outputs()
         canon = obj.as_link().canonical_outputs()
         tags = ''.join(
         tags = ''.join(
-            format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
             for tag in obj.tags.all()
             for tag in obj.tags.all()
+            if str(tag).strip()
         )
         )
         return format_html(
         return format_html(
             '<a href="/{}">'
             '<a href="/{}">'
@@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             obj.archive_path,
             obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
-        ) + mark_safe(f'<span class="tags">{tags}</span>')
+        ) + mark_safe(f' <span class="tags">{tags}</span>')
 
 
     def files(self, obj):
     def files(self, obj):
         return get_icons(obj)
         return get_icons(obj)
@@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
     title_str.admin_order_field = 'title'
     title_str.admin_order_field = 'title'
     url_str.admin_order_field = 'url'
     url_str.admin_order_field = 'url'
 
 
+class TagAdmin(admin.ModelAdmin):
+    list_display = ('slug', 'name', 'id')
+    sort_fields = ('id', 'name', 'slug')
+    readonly_fields = ('id',)
+    search_fields = ('id', 'name', 'slug')
+    fields = (*readonly_fields, 'name', 'slug')
 
 
 
 
 class ArchiveBoxAdmin(admin.AdminSite):
 class ArchiveBoxAdmin(admin.AdminSite):
@@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
 admin.site = ArchiveBoxAdmin()
 admin.site = ArchiveBoxAdmin()
 admin.site.register(get_user_model())
 admin.site.register(get_user_model())
 admin.site.register(Snapshot, SnapshotAdmin)
 admin.site.register(Snapshot, SnapshotAdmin)
+admin.site.register(Tag, TagAdmin)
 admin.site.disable_action('delete_selected')
 admin.site.disable_action('delete_selected')

+ 3 - 7
archivebox/core/models.py

@@ -82,7 +82,7 @@ class Snapshot(models.Model):
         args = args or self.keys
         args = args or self.keys
         return {
         return {
             key: getattr(self, key)
             key: getattr(self, key)
-            if key != 'tags' else self.get_tags_str()
+            if key != 'tags' else self.tags_str()
             for key in args 
             for key in args 
         }
         }
 
 
@@ -93,12 +93,8 @@ class Snapshot(models.Model):
         from ..index import load_link_details
         from ..index import load_link_details
         return load_link_details(self.as_link())
         return load_link_details(self.as_link())
     
     
-    def get_tags_str(self) -> str:
-        tags = ','.join(
-            tag.name
-            for tag in self.tags.all()
-        ) if self.tags.all() else ''
-        return tags
+    def tags_str(self) -> str:
+        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 
 
     @cached_property
     @cached_property
     def bookmarked(self):
     def bookmarked(self):

+ 1 - 0
archivebox/core/settings.py

@@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 ### Django Core Settings
 ### Django Core Settings
 ################################################################################
 ################################################################################
 
 
+DEBUG = True
 WSGI_APPLICATION = 'core.wsgi.application'
 WSGI_APPLICATION = 'core.wsgi.application'
 ROOT_URLCONF = 'core.urls'
 ROOT_URLCONF = 'core.urls'
 
 

+ 3 - 3
archivebox/core/utils.py

@@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
     # slow version: highlights icons based on whether files exist or not for that output
     # slow version: highlights icons based on whether files exist or not for that output
     # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
     # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
     # fast version: all icons are highlighted without checking for outputs in filesystem
     # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method])
+    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
 
 
     return format_html(
     return format_html(
             '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
             '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
                 '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
+                '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
                 '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
                 '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
                 '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
                 '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
                 '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
                 '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
             '</span>',
             '</span>',
             *link_tuple(link, 'wget_path'),
             *link_tuple(link, 'wget_path'),
+            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'pdf_path'),
             *link_tuple(link, 'pdf_path'),
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

+ 1 - 1
archivebox/extractors/archive_org.py

@@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
         result = run(cmd, cwd=str(out_dir), timeout=timeout)
         result = run(cmd, cwd=str(out_dir), timeout=timeout)
         content_location, errors = parse_archive_dot_org_response(result.stdout)
         content_location, errors = parse_archive_dot_org_response(result.stdout)
         if content_location:
         if content_location:
-            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+            archive_org_url = content_location[0]
         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
             archive_org_url = None
             archive_org_url = None
             # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
             # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))

+ 1 - 3
archivebox/extractors/headers.py

@@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
         link.url,
         link.url,
     ]
     ]
     try:
     try:
-        json_headers = get_headers(link.url)
-
+        json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
         output_folder.mkdir(exist_ok=True)
         atomic_write(str(output_folder / "headers.json"), json_headers)
         atomic_write(str(output_folder / "headers.json"), json_headers)
-
     except (Exception, OSError) as err:
     except (Exception, OSError) as err:
         status = 'failed'
         status = 'failed'
         output = err
         output = err

+ 34 - 21
archivebox/extractors/mercury.py

@@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
 
 
 from pathlib import Path
 from pathlib import Path
 
 
-from typing import Optional
+from subprocess import CompletedProcess
+from typing import Optional, Tuple, List
 import json
 import json
 
 
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveError
@@ -20,6 +21,21 @@ from ..config import (
 )
 )
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
+
+
+@enforce_types
+def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
+    # parse out last line of stderr
+    return ArchiveError(
+        f'Got {cmd[0]} response code: {result.returncode}).',
+        *(
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
+            if line.strip()
+        ),
+    )
+
+
 @enforce_types
 @enforce_types
 def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
 def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or link.link_dir
     out_dir = out_dir or link.link_dir
@@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
 
 
 
 
 @enforce_types
 @enforce_types
-def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download reader friendly version using @postlight/mercury-parser"""
     """download reader friendly version using @postlight/mercury-parser"""
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
@@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     status = 'succeeded'
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     timer = TimedProgress(timeout, prefix='      ')
     try:
     try:
+        # Get plain text version of article
         cmd = [
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url,
             link.url,
             "--format=text"
             "--format=text"
         ]
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        txtresult_json = json.loads(result.stdout)
-
+        try:
+            article_text = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
+        
+        # Get HTML version of article
         cmd = [
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url
             link.url
         ]
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        result_json = json.loads(result.stdout)
+        try:
+            article_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
 
 
         output_folder.mkdir(exist_ok=True)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), txtresult_json["content"])
-        atomic_write(str(output_folder / "article.json"), result_json)
-
-        # parse out last line of stderr
-        output_tail = [
-            line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
-            if line.strip()
-        ]
-        hints = (
-            'Got mercury response code: {}.'.format(result.returncode),
-            *output_tail,
-        )
+        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
+        atomic_write(str(output_folder / "article.json"), article_json)
 
 
         # Check for common failure cases
         # Check for common failure cases
         if (result.returncode > 0):
         if (result.returncode > 0):
-            raise ArchiveError('Mercury parser was not able to archive the page', hints)
-    except (Exception, OSError) as err:
+            raise ShellError(cmd, result)
+    except (ArchiveError, Exception, OSError) as err:
         status = 'failed'
         status = 'failed'
         output = err
         output = err
     finally:
     finally: