feat: Refactor remove command to use querysets

Cristian · 5 years ago · commit a8ed72501d
5 changed files with 69 additions and 76 deletions
  1. archivebox/core/admin.py (+1 -1)
  2. archivebox/index/__init__.py (+10 -11)
  3. archivebox/index/sql.py (+3 -3)
  4. archivebox/logging_util.py (+3 -4)
  5. archivebox/main.py (+52 -57)

+ 1 - 1
archivebox/core/admin.py

@@ -50,7 +50,7 @@ def verify_snapshots(modeladmin, request, queryset):
 verify_snapshots.short_description = "Check"
 
 def delete_snapshots(modeladmin, request, queryset):
-    remove(links=[snapshot.as_link() for snapshot in queryset], yes=True, delete=True, out_dir=OUTPUT_DIR)
+    remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
 
 delete_snapshots.short_description = "Delete"
 

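Because Django admin actions receive the user's selection as a QuerySet, the action can now hand it straight to remove() instead of converting every row to a Link first. A minimal sketch of how such a queryset-based action is wired up, assuming standard Django admin conventions (the SnapshotAdmin class and registration below are illustrative, not taken from this commit):

    from django.contrib import admin
    from core.models import Snapshot

    class SnapshotAdmin(admin.ModelAdmin):
        # delete_snapshots (defined above) is called as
        # (modeladmin, request, queryset); the queryset holds exactly
        # the rows the user checked in the admin list view
        actions = [delete_snapshots]

    admin.site.register(Snapshot, SnapshotAdmin)
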
+ 10 - 11
archivebox/index/__init__.py

@@ -11,7 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Q
 
 from ..util import (
     scheme,
@@ -370,19 +370,19 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
 
 
 LINK_FILTERS = {
-    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
-    'substring': lambda link, pattern: pattern in link.url,
-    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
-    'domain': lambda link, pattern: link.domain == pattern,
+    'exact': lambda pattern: Q(url=pattern),
+    'substring': lambda pattern: Q(url__icontains=pattern),
+    'regex': lambda pattern: Q(url__iregex=pattern),
+    'domain': lambda pattern: Q(domain=pattern),
 }
 
 @enforce_types
-def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    q_filter = Q()
     for pattern in filter_patterns:
         try:
-            if LINK_FILTERS[filter_type](link, pattern):
-                return True
-        except Exception:
+            q_filter = q_filter | LINK_FILTERS[filter_type](pattern)
+        except KeyError:
             stderr()
             stderr(
                 f'[X] Got invalid pattern for --filter-type={filter_type}:',
@@ -390,8 +390,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
             )
             stderr(f'    {pattern}')
             raise SystemExit(2)
-
-    return False
+    return snapshots.filter(q_filter)
 
 
 def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
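
The key change here is that each filter now builds a Q object instead of testing one Link at a time in Python, and snapshot_filter ORs the Q objects together so all patterns collapse into a single SQL WHERE clause. Two behavioral side effects are visible in the diff: 'substring' and 'regex' become case-insensitive (icontains/iregex), and 'exact' no longer also matches link.base_url. A minimal sketch of the OR-combination, assuming a configured Django project with the Snapshot model (the pattern list is illustrative):

    from django.db.models import Q
    from core.models import Snapshot

    patterns = ['https://example.com/a', 'https://example.com/b']
    q_filter = Q()
    for pattern in patterns:
        q_filter = q_filter | Q(url=pattern)    # the 'exact' lambda above

    # Executes as one query: SELECT ... WHERE url = ? OR url = ?
    matches = Snapshot.objects.filter(q_filter)

Note that an empty Q() is a no-op filter, so calling snapshot_filter with an empty pattern list returns the queryset unchanged, whereas the old link_matches_filter returned False for every link.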

+ 3 - 3
archivebox/index/sql.py

@@ -2,6 +2,7 @@ __package__ = 'archivebox.index'
 
 from io import StringIO
 from typing import List, Tuple, Iterator
+from django.db.models import QuerySet
 
 from .schema import Link
 from ..util import enforce_types
@@ -21,14 +22,13 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     )
 
 @enforce_types
-def remove_from_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     from django.db import transaction
 
     with transaction.atomic():
-        for link in links:
-            Snapshot.objects.filter(url=link.url).delete()
+        snapshots.delete()
 
 @enforce_types
 def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
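
With a QuerySet in hand, the per-link loop collapses into a single call: QuerySet.delete() issues a bulk DELETE for all matching rows rather than one query per URL, and transaction.atomic() keeps the removal all-or-nothing. A minimal sketch of the pattern, assuming a configured Django project (the filter is illustrative):

    from django.db import transaction
    from core.models import Snapshot

    snapshots = Snapshot.objects.filter(url__icontains='example.com')
    with transaction.atomic():
        # One DELETE ... WHERE ... statement for the whole queryset,
        # instead of N separate Snapshot.objects.filter(url=...).delete() calls
        snapshots.delete()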

+ 3 - 4
archivebox/logging_util.py

@@ -408,19 +408,18 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
         except (KeyboardInterrupt, EOFError, AssertionError):
             raise SystemExit(0)
 
-def log_removal_finished(all_links: int, to_keep: int):
+def log_removal_finished(all_links: int, to_remove: int):
     if all_links == 0:
         print()
         print('{red}[X] No matching links found.{reset}'.format(**ANSI))
     else:
-        num_removed = all_links - to_keep
         print()
         print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
-            num_removed,
+            to_remove,
             all_links,
             **ANSI,
         ))
-        print('    Index now contains {} links.'.format(to_keep))
+        print('    Index now contains {} links.'.format(all_links - to_remove))
 
 
 def log_shell_welcome_msg():
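
The signature change means the caller now reports how many links were removed rather than how many were kept, and the remaining total is derived. A worked example with the new parameters (values are illustrative):

    log_removal_finished(all_links=10, to_remove=3)
    # [√] Removed 3 out of 10 links from the archive index.
    #     Index now contains 7 links.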

+ 52 - 57
archivebox/main.py

@@ -7,6 +7,7 @@ from pathlib import Path
 
 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
+from django.db.models import QuerySet
 
 from .cli import (
     list_subcommands,
@@ -31,7 +32,7 @@ from .index import (
     dedupe_links,
     write_main_index,
     write_static_index,
-    link_matches_filter,
+    snapshot_filter,
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -567,7 +568,7 @@ def add(urls: Union[str, List[str]],
 def remove(filter_str: Optional[str]=None,
            filter_patterns: Optional[List[str]]=None,
            filter_type: str='exact',
-           links: Optional[List[Link]]=None,
+           snapshots: Optional[QuerySet]=None,
            after: Optional[float]=None,
            before: Optional[float]=None,
            yes: bool=False,
@@ -577,7 +578,7 @@ def remove(filter_str: Optional[str]=None,
     
     check_data_folder(out_dir=out_dir)
 
-    if links is None:
+    if not snapshots:
         if filter_str and filter_patterns:
             stderr(
                 '[X] You should pass either a pattern as an argument, '
@@ -593,60 +594,54 @@ def remove(filter_str: Optional[str]=None,
             )
             stderr()
             hint(('To remove all urls you can run:',
-                  'archivebox remove --filter-type=regex ".*"'))
+                'archivebox remove --filter-type=regex ".*"'))
             stderr()
             raise SystemExit(2)
         elif filter_str:
             filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
 
-        log_list_started(filter_patterns, filter_type)
-        timer = TimedProgress(360, prefix='      ')
-        try:
-            links = list(list_links(
-                filter_patterns=filter_patterns,
-                filter_type=filter_type,
-                after=after,
-                before=before,
-            ))
-        finally:
-            timer.end()
+    list_kwargs = {
+        "filter_patterns": filter_patterns,
+        "filter_type": filter_type,
+        "after": after,
+        "before": before,
+    }
+    if snapshots:
+        list_kwargs["snapshots"] = snapshots
 
+    log_list_started(filter_patterns, filter_type)
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        snapshots = list_links(**list_kwargs)
+    finally:
+        timer.end()
 
-    if not len(links):
+
+    if not snapshots.exists():
         log_removal_finished(0, 0)
         raise SystemExit(1)
 
 
-    log_list_finished(links)
-    log_removal_started(links, yes=yes, delete=delete)
+    log_links = [link.as_link() for link in snapshots]
+    log_list_finished(log_links)
+    log_removal_started(log_links, yes=yes, delete=delete)
 
     timer = TimedProgress(360, prefix='      ')
     try:
-        to_keep = []
-        to_delete = []
-        all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-        for link in all_links:
-            should_remove = (
-                (after is not None and float(link.timestamp) < after)
-                or (before is not None and float(link.timestamp) > before)
-                or link_matches_filter(link, filter_patterns or [], filter_type)
-                or link in links
-            )
-            if should_remove:
-                to_delete.append(link)
-
-                if delete:
-                    shutil.rmtree(link.link_dir, ignore_errors=True)
-            else:
-                to_keep.append(link)
+        for snapshot in snapshots:
+            if delete:
+                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
     finally:
         timer.end()
 
-    remove_from_sql_main_index(links=to_delete, out_dir=out_dir)
-    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
-    log_removal_finished(len(all_links), len(to_keep))
+    to_remove = snapshots.count()
+
+    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
+    all_snapshots = load_main_index(out_dir=out_dir)
+    write_static_index([link.as_link() for link in all_snapshots], out_dir=out_dir)
+    log_removal_finished(all_snapshots.count(), to_remove)
     
-    return to_keep
+    return all_snapshots
 
 @enforce_types
 def update(resume: Optional[float]=None,
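
After this refactor, remove() resolves everything to a QuerySet up front: filter patterns are handed to list_links(), a caller-supplied queryset is passed through, and only the optional directory deletion still iterates snapshots. A sketch of the two call styles the new signature supports, assuming an initialized ArchiveBox data folder with Django already set up (the filter values are illustrative):

    from archivebox.main import remove
    from archivebox.config import OUTPUT_DIR

    # 1. Pattern-based, as the CLI invokes it:
    remove(filter_patterns=['example.com'], filter_type='domain',
           yes=True, delete=True, out_dir=OUTPUT_DIR)

    # 2. QuerySet-based, as the admin action invokes it:
    from core.models import Snapshot
    remove(snapshots=Snapshot.objects.filter(title__icontains='news'),
           yes=True, delete=True, out_dir=OUTPUT_DIR)

One caveat worth flagging: all_snapshots is loaded after the rows are deleted, so log_removal_finished() receives the post-removal total as all_links, which skews both the 'Removed X out of Y' line and the derived remaining count.
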
@@ -737,18 +732,18 @@ def list_all(filter_patterns_str: Optional[str]=None,
         filter_patterns = filter_patterns_str.split('\n')
 
 
-    links = list_links(
+    snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )
 
-    if sort:
-        links = sorted(links, key=lambda link: getattr(link, sort))
+    #if sort:
+    #    snapshots = sorted(links, key=lambda link: getattr(link, sort))
 
     folders = list_folders(
-        links=list(links),
+        links=[snapshot.as_link() for snapshot in snapshots],
         status=status,
         out_dir=out_dir,
     )
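
Sorting is commented out here rather than ported. Since list_links() now returns a QuerySet, the natural replacement would presumably be a database-side ORDER BY once the sort key is validated against Snapshot's fields (a hedged sketch, not part of this commit):

    if sort:
        # e.g. sort='url' or sort='-added'; QuerySet.order_by pushes the
        # sort into SQL instead of sorting Link objects in Python
        snapshots = snapshots.order_by(sort)
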
@@ -758,7 +753,8 @@ def list_all(filter_patterns_str: Optional[str]=None,
 
 
 @enforce_types
-def list_links(filter_patterns: Optional[List[str]]=None,
+def list_links(snapshots: Optional[QuerySet]=None,
+               filter_patterns: Optional[List[str]]=None,
                filter_type: str='exact',
                after: Optional[float]=None,
                before: Optional[float]=None,
@@ -766,19 +762,18 @@ def list_links(filter_patterns: Optional[List[str]]=None,
     
     check_data_folder(out_dir=out_dir)
 
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-
-    for link in all_links:
-        if after is not None and float(link.timestamp) < after:
-            continue
-        if before is not None and float(link.timestamp) > before:
-            continue
-        
-        if filter_patterns:
-            if link_matches_filter(link, filter_patterns, filter_type):
-                yield link
-        else:
-            yield link
+    if snapshots:
+        all_snapshots = snapshots
+    else:
+        all_snapshots = load_main_index(out_dir=out_dir)
+
+    if after is not None:
+        all_snapshots = all_snapshots.filter(timestamp__lt=after)
+    if before is not None:
+        all_snapshots = all_snapshots.filter(timestamp__gt=before)
+    if filter_patterns:
+        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+    return all_snapshots
 
 @enforce_types
 def list_folders(links: List[Link],
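
One detail worth double-checking in this hunk: the removed loop kept links satisfying float(link.timestamp) >= after and <= before, so the new timestamp__lt/timestamp__gt lookups select the opposite range. A literal queryset translation of the old keep-condition would presumably read as below (still only a sketch: timestamp is stored as a string field on Snapshot, so these lookups compare text rather than floats):

    if after is not None:
        all_snapshots = all_snapshots.filter(timestamp__gte=after)
    if before is not None:
        all_snapshots = all_snapshots.filter(timestamp__lte=before)

Also note that `if snapshots:` evaluates the queryset just to test truthiness; snapshots.exists() would branch without fetching rows.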