hai 5 anos · ac9e0e356d
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
 
															         stderr('')
														
 
															     if config['TIMEOUT'] < 5:
														
 
															-        stderr()
														
 
															         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
														
 
															         stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
														
 
															         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
														
 
															         stderr()
														
 
															         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
														
 
															         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
														
 
															+        stderr()
														
 
															     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
														
 
															-        stderr()
														
 
															         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
														
 
															         stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
														
 
															         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
														
 
															         stderr()
														
 
															         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
														
 
															         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
														
 
															+        stderr()
														
 
															     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
														
 
															-        stderr()
														
 
															         stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
														
 
															         stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
														
 
															         stderr('    (Setting it somewhere over 60 seconds is recommended)')
														
 
															         stderr()
														
 
															         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
														
 
															         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
														
 
															-
														
 
															+        stderr()
														
 
															 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
														
 
															     output_dir = out_dir or config['OUTPUT_DIR']
														
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
 
															 from django.contrib.auth import get_user_model
														
 
															 from django import forms
														
 
															-from core.models import Snapshot
														
 
															+from core.models import Snapshot, Tag
														
 
															 from core.forms import AddLinkForm, TagField
														
 
															 from core.utils import get_icons
														
@@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
 
															     def title_str(self, obj):
														
 
															         canon = obj.as_link().canonical_outputs()
														
 
															         tags = ''.join(
														
 
															-            format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
														
 
															+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
														
 
															             for tag in obj.tags.all()
														
 
															+            if str(tag).strip()
														
 
															         )
														
 
															         return format_html(
														
 
															             '<a href="/{}">'
														
@@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
 
															             obj.archive_path,
														
 
															             'fetched' if obj.latest_title or obj.title else 'pending',
														
 
															             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
														
 
															-        ) + mark_safe(f'<span class="tags">{tags}</span>')
														
 
															+        ) + mark_safe(f' <span class="tags">{tags}</span>')
														
 
															     def files(self, obj):
														
 
															         return get_icons(obj)
														
@@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
 
															     title_str.admin_order_field = 'title'
														
 
															     url_str.admin_order_field = 'url'
														
 
															+class TagAdmin(admin.ModelAdmin):
														
 
															+    list_display = ('slug', 'name', 'id')
														
 
															+    sort_fields = ('id', 'name', 'slug')
														
 
															+    readonly_fields = ('id',)
														
 
															+    search_fields = ('id', 'name', 'slug')
														
 
															+    fields = (*readonly_fields, 'name', 'slug')
														
 
															 class ArchiveBoxAdmin(admin.AdminSite):
														
@@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
 
															 admin.site = ArchiveBoxAdmin()
														
 
															 admin.site.register(get_user_model())
														
 
															 admin.site.register(Snapshot, SnapshotAdmin)
														
 
															+admin.site.register(Tag, TagAdmin)
														
 
															 admin.site.disable_action('delete_selected')
														
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -82,7 +82,7 @@ class Snapshot(models.Model):
 
															         args = args or self.keys
														
 
															         return {
														
 
															             key: getattr(self, key)
														
 
															-            if key != 'tags' else self.get_tags_str()
														
 
															+            if key != 'tags' else self.tags_str()
														
 
															             for key in args 
														
 
															         }
														
@@ -93,12 +93,8 @@ class Snapshot(models.Model):
 
															         from ..index import load_link_details
														
 
															         return load_link_details(self.as_link())
														
 
															-    def get_tags_str(self) -> str:
														
 
															-        tags = ','.join(
														
 
															-            tag.name
														
 
															-            for tag in self.tags.all()
														
 
															-        ) if self.tags.all() else ''
														
 
															-        return tags
														
 
															+    def tags_str(self) -> str:
														
 
															+        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
														
 
															     @cached_property
														
 
															     def bookmarked(self):
														
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 
															 ### Django Core Settings
														
 
															 ################################################################################
														
 
															+DEBUG = True
														
 
															 WSGI_APPLICATION = 'core.wsgi.application'
														
 
															 ROOT_URLCONF = 'core.urls'
														
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
 
															     # slow version: highlights icons based on whether files exist or not for that output
														
 
															     # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
														
 
															     # fast version: all icons are highlighted without checking for outputs in filesystem
														
 
															-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method])
														
 
															+    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
														
 
															     return format_html(
														
 
															             '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
														
 
															                 '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
														
 
															+                '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
														
 
															                 '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
														
 
															                 '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
														
 
															                 '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
														
 
															                 '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a>'
														
 
															                 '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
														
 
															                 '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
														
 
															                 '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
														
 
															             '</span>',
														
 
															             *link_tuple(link, 'wget_path'),
														
 
															+            *link_tuple(link, 'singlefile_path'),
														
 
															             *link_tuple(link, 'pdf_path'),
														
 
															             *link_tuple(link, 'screenshot_path'),
														
 
															             *link_tuple(link, 'dom_path'),
														
 
															             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
														
 
															-            *link_tuple(link, 'singlefile_path'),
														
 
															             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
														
 
															             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
														
 
															             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
														
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
 
															         result = run(cmd, cwd=str(out_dir), timeout=timeout)
														
 
															         content_location, errors = parse_archive_dot_org_response(result.stdout)
														
 
															         if content_location:
														
 
															-            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
														
 
															+            archive_org_url = content_location[0]
														
 
															         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
														
 
															             archive_org_url = None
														
 
															             # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
														
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
															         link.url,
														
 
															     ]
														
 
															     try:
														
 
															-        json_headers = get_headers(link.url)
														
 
															-
														
 
															+        json_headers = get_headers(link.url, timeout=timeout)
														
 
															         output_folder.mkdir(exist_ok=True)
														
 
															         atomic_write(str(output_folder / "headers.json"), json_headers)
														
 
															-
														
 
															     except (Exception, OSError) as err:
														
 
															         status = 'failed'
														
 
															         output = err
														
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
 
															 from pathlib import Path
														
 
															-from typing import Optional
														
 
															+from subprocess import CompletedProcess
														
 
															+from typing import Optional, Tuple, List
														
 
															 import json
														
 
															 from ..index.schema import Link, ArchiveResult, ArchiveError
														
@@ -20,6 +21,21 @@ from ..config import (
 
															 )
														
 
															 from ..logging_util import TimedProgress
														
 
															+
														
 
															+
														
 
															+@enforce_types
														
 
															+def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
														
 
															+    # parse out last line of stderr
														
 
															+    return ArchiveError(
														
 
															+        f'Got {cmd[0]} response code: {result.returncode}).',
														
 
															+        *(
														
 
															+            line.strip()
														
 
															+            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
														
 
															+            if line.strip()
														
 
															+        ),
														
 
															+    )
														
 
															+
														
 
															+
														
 
															 @enforce_types
														
 
															 def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
														
 
															     out_dir = out_dir or link.link_dir
														
@@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
 
															 @enforce_types
														
 
															-def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
														
 
															+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
														
 
															     """download reader friendly version using @postlight/mercury-parser"""
														
 
															     out_dir = Path(out_dir or link.link_dir)
														
@@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
															     status = 'succeeded'
														
 
															     timer = TimedProgress(timeout, prefix='      ')
														
 
															     try:
														
 
															+        # Get plain text version of article
														
 
															         cmd = [
														
 
															             DEPENDENCIES['MERCURY_BINARY']['path'],
														
 
															             link.url,
														
 
															             "--format=text"
														
 
															         ]
														
 
															         result = run(cmd, cwd=out_dir, timeout=timeout)
														
 
															-        txtresult_json = json.loads(result.stdout)
														
 
															-
														
 
															+        try:
														
 
															+            article_text = json.loads(result.stdout)
														
 
															+        except json.JSONDecodeError:
														
 
															+            raise ShellError(cmd, result)
														
 
															+        
														
 
															+        # Get HTML version of article
														
 
															         cmd = [
														
 
															             DEPENDENCIES['MERCURY_BINARY']['path'],
														
 
															             link.url
														
 
															         ]
														
 
															         result = run(cmd, cwd=out_dir, timeout=timeout)
														
 
															-        result_json = json.loads(result.stdout)
														
 
															+        try:
														
 
															+            article_json = json.loads(result.stdout)
														
 
															+        except json.JSONDecodeError:
														
 
															+            raise ShellError(cmd, result)
														
 
															         output_folder.mkdir(exist_ok=True)
														
 
															-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
														
 
															-        atomic_write(str(output_folder / "content.txt"), txtresult_json["content"])
														
 
															-        atomic_write(str(output_folder / "article.json"), result_json)
														
 
															-
														
 
															-        # parse out last line of stderr
														
 
															-        output_tail = [
														
 
															-            line.strip()
														
 
															-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
														
 
															-            if line.strip()
														
 
															-        ]
														
 
															-        hints = (
														
 
															-            'Got mercury response code: {}.'.format(result.returncode),
														
 
															-            *output_tail,
														
 
															-        )
														
 
															+        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
														
 
															+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
														
 
															+        atomic_write(str(output_folder / "article.json"), article_json)
														
 
															         # Check for common failure cases
														
 
															         if (result.returncode > 0):
														
 
															-            raise ArchiveError('Mercury parser was not able to archive the page', hints)
														
 
															-    except (Exception, OSError) as err:
														
 
															+            raise ShellError(cmd, result)
														
 
															+    except (ArchiveError, Exception, OSError) as err:
														
 
															         status = 'failed'
														
 
															         output = err
														
 
															     finally: