
working consistent list and remove with filtering

Nick Sweeting, 6 years ago
Commit d8d8f7c2cc

+ 21 - 14
archivebox/cli/archivebox_list.py

@@ -5,12 +5,11 @@ __command__ = 'archivebox list'
 __description__ = 'List all the URLs currently in the archive.'
 
 import sys
-import json
 import argparse
 
 
-from ..legacy.util import reject_stdin, ExtendedEncoder
-from ..legacy.main import list_archive_data, csv_format
+from ..legacy.util import reject_stdin, to_json, to_csv
+from ..legacy.main import list_archive_data
 
 
 def main(args=None):
@@ -33,16 +32,10 @@ def main(args=None):
         action='store_true',
         help="Print the output in JSON format with all columns included.",
     )
-    parser.add_argument(
-        '--filter', #'-f',
-        type=str,
-        help="List only URLs matching the given regex pattern.",
-        default=None,
-    )
     parser.add_argument(
         '--sort', #'-s',
         type=str,
-        help="List the links sorted using the given key, e.g. timestamp or updated",
+        help="List the links sorted using the given key, e.g. timestamp or updated.",
         default=None,
     )
     parser.add_argument(
@@ -57,11 +50,26 @@ def main(args=None):
         help="List only URLs bookmarked after the given timestamp.",
         default=None,
     )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'patterns',
+        nargs='*',
+        type=str,
+        default=None,
+        help='List only URLs matching these filter patterns.'
+    )
     command = parser.parse_args(args)
     reject_stdin(__command__)
 
     links = list_archive_data(
-        filter_regex=command.filter,
+        filter_patterns=command.patterns,
+        filter_type=command.filter_type,
         before=command.before,
         after=command.after,
     )
@@ -69,10 +77,9 @@ def main(args=None):
         links = sorted(links, key=lambda link: getattr(link, command.sort))
 
     if command.csv:
-        print(command.csv)
-        print('\n'.join(csv_format(link, command.csv) for link in links))
+        print(to_csv(links, csv_cols=command.csv.split(','), header=True))
     elif command.json:
-        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
+        print(to_json(links, indent=4, sort_keys=True))
     else:
         print('\n'.join(link.url for link in links))
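
For reference, a minimal sketch of driving the updated list command from Python, assuming ArchiveBox is installed and run inside an initialized archive directory. main() parses the same flags defined above; the domain used here is purely a placeholder:

    # Sketch only: pass the CLI flags as an explicit args list instead of via the shell.
    from archivebox.cli.archivebox_list import main

    # All archived links from one (placeholder) domain, as CSV with a header row.
    main(['--filter-type', 'domain', '--csv', 'timestamp,url,is_archived', 'example.com'])

    # The same selection as pretty-printed JSON; with no patterns, every link is listed.
    main(['--filter-type', 'domain', '--json', 'example.com'])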

+ 87 - 0
archivebox/cli/archivebox_remove.py

@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox remove'
+__description__ = 'Remove the specified URLs from the archive.'
+
+import sys
+import argparse
+
+
+from ..legacy.main import list_archive_data, remove_archive_links
+from ..legacy.util import reject_stdin, to_csv, TimedProgress
+from ..legacy.config import ANSI
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.add_argument(
+        '--yes', # '-y',
+        action='store_true',
+        help='Remove links instantly without prompting to confirm.',
+    )
+    parser.add_argument(
+        '--delete', # '-r',
+        action='store_true',
+        help=(
+            "In addition to removing the link from the index, "
+            "also delete its archived content and metadata folder."
+        ),
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="List only URLs bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="List only URLs bookmarked after the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--filter-type',
+        type=str,
+        choices=('exact', 'substring', 'domain', 'regex'),
+        default='exact',
+        help='Type of pattern matching to use when filtering URLs',
+    )
+    parser.add_argument(
+        'pattern',
+        nargs='?',
+        type=str,
+        default=None,
+        help='URLs matching this filter pattern will be removed from the index.'
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read()
+        if stdin_raw_text and command.pattern:
+            print(
+                '[X] You should pass either a pattern as an argument, '
+                'or pass a list of patterns via stdin, but not both.\n'
+            )
+            raise SystemExit(1)
+
+        patterns = [pattern.strip() for pattern in stdin_raw_text.split('\n')]
+    else:
+        patterns = [command.pattern]
+
+    remove_archive_links(
+        filter_patterns=patterns, filter_type=command.filter_type,
+        before=command.before, after=command.after,
+        yes=command.yes, delete=command.delete,
+    )
+    
+
+if __name__ == '__main__':
+    main()
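
The command above is a thin wrapper around remove_archive_links() from legacy/main.py (see below); a minimal sketch of calling it directly, assuming an initialized archive in OUTPUT_DIR and using a placeholder URL:

    # Sketch only: same flow the `archivebox remove` entrypoint above wires up.
    from archivebox.legacy.main import remove_archive_links

    remove_archive_links(
        filter_patterns=['https://example.com/page'],  # placeholder URL, matched exactly
        filter_type='exact',
        before=None, after=None,   # no timestamp bounds
        yes=False,                 # print the matching rows and prompt before removing
        delete=False,              # drop index entries only; keep archived folders on disk
    )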

+ 2 - 0
archivebox/legacy/index.py

@@ -15,6 +15,8 @@ from .config import (
     FOOTER_INFO,
     TIMEOUT,
     URL_BLACKLIST_PTN,
+    ANSI,
+    stderr,
 )
 from .util import (
     scheme,

+ 79 - 10
archivebox/legacy/main.py

@@ -1,10 +1,10 @@
 import re
-import json
+import shutil
 
 from typing import List, Optional, Iterable
 
 from .schema import Link
-from .util import enforce_types, ExtendedEncoder
+from .util import enforce_types, TimedProgress, to_csv
 from .index import (
     links_after_timestamp,
     load_links_index,
@@ -12,6 +12,7 @@ from .index import (
 )
 from .archive_methods import archive_link
 from .config import (
+    ANSI,
     ONLY_NEW,
     OUTPUT_DIR,
     check_dependencies,
@@ -61,23 +62,91 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     return all_links
 
 
+LINK_FILTERS = {
+    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
+    'substring': lambda link, pattern: pattern in link.url,
+    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
+    'domain': lambda link, pattern: link.domain == pattern,
+}
+
+def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+    for pattern in filter_patterns:
+        if LINK_FILTERS[filter_type](link, pattern):
+            return True
+
+    return False
+
+
 @enforce_types
-def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
+def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
+                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
     
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
 
-    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
-
     for link in all_links:
-        if pattern and not pattern.match(link.url):
-            continue
         if after is not None and float(link.timestamp) < after:
             continue
         if before is not None and float(link.timestamp) > before:
             continue
+        
+        if filter_patterns:
+            if link_matches_filter(link, filter_patterns, filter_type):
+                yield link
+        else:
+            yield link
 
-        yield link
 
+@enforce_types
+def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
+                         after: Optional[float]=None, before: Optional[float]=None,
+                         yes: bool=False, delete: bool=False):
+    
+    check_dependencies()
+
+    print('[*] Finding links in the archive index matching these {} patterns:'.format(filter_type))
+    print('    {}'.format(' '.join(filter_patterns)))
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        links = list(list_archive_data(
+            filter_patterns=filter_patterns,
+            filter_type=filter_type,
+            after=after,
+            before=before,
+        ))
+    finally:
+        timer.end()
+    if not len(links):
+        print()
+        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
+        raise SystemExit(1)
+
+    print()
+    print('-------------------------------------------------------------------')
+    print(to_csv(links, csv_cols=['link_dir', 'url', 'is_archived', 'num_outputs']))
+    print('-------------------------------------------------------------------')
+    print()
+    if not yes:
+        resp = input('{lightyellow}[?] Are you sure you want to permanently remove these {} archived links? N/y: {reset}'.format(len(links), **ANSI))
+        
+        if not resp.lower() == 'y':
+            raise SystemExit(0)
 
-def csv_format(link: Link, csv_cols: str) -> str:
-    return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    to_keep = []
+
+    for link in all_links:
+        should_remove = (
+            (after is not None and float(link.timestamp) < after)
+            or (before is not None and float(link.timestamp) > before)
+            or link_matches_filter(link, filter_patterns, filter_type)
+        )
+        if not should_remove:
+            to_keep.append(link)
+        elif should_remove and delete:
+            shutil.rmtree(link.link_dir)
+
+    num_removed = len(all_links) - len(to_keep)
+    write_links_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
+    print()
+    print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(num_removed, len(all_links), **ANSI))
+    print('    Index now contains {} links.'.format(len(to_keep)))
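
The filter semantics added here are easiest to see in isolation. Below is a self-contained sketch that reuses the LINK_FILTERS table from this diff with a namedtuple stand-in for Link (the real dataclass has many more fields). Note that link_matches_filter() treats the supplied patterns as OR'd together, and the regex filter uses re.match, so patterns are anchored at the start of the URL:

    import re
    from collections import namedtuple

    # Stand-in with only the attributes the filters read; not the real schema.Link.
    FakeLink = namedtuple('FakeLink', 'url base_url domain')

    LINK_FILTERS = {
        'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
        'substring': lambda link, pattern: pattern in link.url,
        'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
        'domain': lambda link, pattern: link.domain == pattern,
    }

    link = FakeLink(url='https://example.com/post?id=1',
                    base_url='example.com/post?id=1',
                    domain='example.com')

    assert LINK_FILTERS['domain'](link, 'example.com')
    assert LINK_FILTERS['substring'](link, '/post')
    assert LINK_FILTERS['regex'](link, r'https://example\.com/')      # anchored at the start
    assert not LINK_FILTERS['exact'](link, 'https://example.com/other')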

+ 36 - 4
archivebox/legacy/schema.py

@@ -50,16 +50,33 @@ class ArchiveResult:
     def from_json(cls, json_info):
         from .util import parse_date
 
-        allowed_fields = {f.name for f in fields(cls)}
         info = {
             key: val
             for key, val in json_info.items()
-            if key in allowed_fields
+            if key in cls.field_names()
         }
         info['start_ts'] = parse_date(info['start_ts'])
         info['end_ts'] = parse_date(info['end_ts'])
         return cls(**info)
 
+    def to_json(self, indent=4, sort_keys=True):
+        from .util import to_json
+
+        return to_json(self, indent=indent, sort_keys=sort_keys)
+
+    def to_csv(self, cols=None):
+        from .util import to_json
+
+        cols = cols or self.field_names()
+        return ','.join(
+            to_json(getattr(self, col), indent=None)
+            for col in cols
+        )
+    
+    @classmethod
+    def field_names(cls):
+        return [f.name for f in fields(cls)]
+
     @property
     def duration(self) -> int:
         return (self.end_ts - self.start_ts).seconds
@@ -145,11 +162,10 @@ class Link:
     def from_json(cls, json_info):
         from .util import parse_date
         
-        allowed_fields = {f.name for f in fields(cls)}
         info = {
             key: val
             for key, val in json_info.items()
-            if key in allowed_fields
+            if key in cls.field_names()
         }
         info['updated'] = parse_date(info['updated'])
 
@@ -166,6 +182,22 @@ class Link:
         info['history'] = cast_history
         return cls(**info)
 
+    def to_json(self, indent=4, sort_keys=True):
+        from .util import to_json
+
+        return to_json(self, indent=indent, sort_keys=sort_keys)
+
+    def to_csv(self, csv_cols: List[str]):
+        from .util import to_json
+
+        return ','.join(
+            to_json(getattr(self, col), indent=None)
+            for col in csv_cols
+        )
+
+    @classmethod
+    def field_names(cls):
+        return [f.name for f in fields(cls)]
 
     @property
     def link_dir(self) -> str:
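
The field_names()/from_json() pairing above is generic dataclass plumbing; a toy illustration (the class and field names here are made up, not ArchiveBox's) of why unknown JSON keys are dropped before construction:

    from dataclasses import dataclass, fields

    @dataclass(frozen=True)
    class Bookmark:                 # toy stand-in, not part of the real schema
        url: str
        title: str

        @classmethod
        def field_names(cls):
            return [f.name for f in fields(cls)]

        @classmethod
        def from_json(cls, json_info):
            # Keep only known fields so index files written by other versions still load.
            info = {key: val for key, val in json_info.items() if key in cls.field_names()}
            return cls(**info)

    print(Bookmark.field_names())     # ['url', 'title']
    print(Bookmark.from_json({'url': 'https://example.com', 'title': 'Example', 'extra': 1}))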

+ 16 - 2
archivebox/legacy/util.py

@@ -6,7 +6,7 @@ import time
 import shutil
 
 from json import JSONEncoder
-from typing import List, Optional, Any, Union
+from typing import List, Optional, Any, Union, IO
 from inspect import signature
 from functools import wraps
 from hashlib import sha256
@@ -616,13 +616,27 @@ class ExtendedEncoder(JSONEncoder):
         return JSONEncoder.default(self, obj)
 
 
+def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> Optional[str]:
+    if file:
+        json.dump(obj, file, indent=indent, sort_keys=sort_keys, cls=cls)
+        return None
+    else:
+        return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
+
+
+def to_csv(links: List[Link], csv_cols: Optional[List[str]]=None, header: bool=True) -> str:
+    csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
+    header_str = '{}\n'.format(','.join(csv_cols)) if header else ''
+    return header_str + '\n'.join(link.to_csv(csv_cols=csv_cols) for link in links)
+
+
 def atomic_write(contents: Union[dict, str], path: str) -> None:
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     try:
         tmp_file = '{}.tmp'.format(path)
         with open(tmp_file, 'w+', encoding='utf-8') as f:
             if isinstance(contents, dict):
-                json.dump(contents, f, indent=4, cls=ExtendedEncoder)
+                to_json(contents, file=f)
             else:
                 f.write(contents)
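
A short usage sketch of the two new helpers, assuming an initialized archive so load_links_index() can return Link objects, and that the archivebox.legacy package is importable (column names below match the Link fields used elsewhere in this diff):

    from archivebox.legacy.util import to_json, to_csv
    from archivebox.legacy.index import load_links_index
    from archivebox.legacy.config import OUTPUT_DIR

    links, _ = load_links_index(out_dir=OUTPUT_DIR)

    # CSV with a header row, restricted to two columns.
    print(to_csv(links, csv_cols=['timestamp', 'url'], header=True))

    # JSON as a string...
    print(to_json(links, indent=4, sort_keys=True))

    # ...or streamed straight to an open file handle (returns None in that case).
    with open('links.json', 'w', encoding='utf-8') as f:
        to_json(links, file=f)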