Merge pull request #555 from cdvv7788/cleanup

Nick Sweeting committed 5 years ago · commit 04291c4d47
5 changed files with 42 additions and 65 deletions
  1. archivebox/index/__init__.py (+4 -4)
  2. archivebox/index/html.py (+8 -0)
  3. archivebox/index/json.py (+15 -26)
  4. archivebox/logging_util.py (+0 -32)
  5. archivebox/main.py (+15 -3)

+ 4 - 4
archivebox/index/__init__.py

@@ -383,7 +383,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in links
@@ -391,7 +391,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -399,7 +399,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -424,7 +424,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
    return {
         link.link_dir: link
         for link in filter(is_valid, links)

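All four folder-status helpers now convert snapshots with as_link_with_details() instead of as_link(). A plausible reading, sketched under the assumption that as_link_with_details() additionally loads each snapshot's archive history from its data directory (a bare as_link() conversion would leave link.history empty, giving filters like is_archived and is_valid nothing to inspect):

    # hypothetical illustration, not part of the commit
    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
    # an is_archived-style check only works if history was populated:
    archived = [link for link in links if link.history]
    print({link.link_dir: link.url for link in archived})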
+ 8 - 0
archivebox/index/html.py

@@ -46,6 +46,14 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
                     yield line.split('"')[1]
     return ()
 
+@enforce_types
+def generate_index_from_links(links: List[Link], with_headers: bool):
+    if with_headers:
+        output = main_index_template(links)
+    else:
+        output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
+    return output
+
 @enforce_types
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""

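Usage of the new HTML helper is straightforward; a minimal sketch, assuming links is an already-loaded List[Link]:

    from archivebox.index.html import generate_index_from_links

    full_page = generate_index_from_links(links, with_headers=True)   # renders MAIN_INDEX_TEMPLATE
    fragment = generate_index_from_links(links, with_headers=False)   # renders MINIMAL_INDEX_TEMPLATE

Both branches return the string from main_index_template, so the function could carry a -> str annotation, though the diff leaves it un-annotated.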
+ 15 - 26
archivebox/index/json.py

@@ -8,7 +8,7 @@ from pathlib import Path
 from datetime import datetime
 from typing import List, Optional, Iterator, Any, Union
 
-from .schema import Link, ArchiveResult
+from .schema import Link
 from ..system import atomic_write
 from ..util import enforce_types
 from ..config import (
@@ -39,7 +39,20 @@ MAIN_INDEX_HEADER = {
     },
 }
 
-### Main Links Index
+@enforce_types
+def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    if with_headers:
+        output = {
+            **MAIN_INDEX_HEADER,
+            'num_links': len(links),
+            'updated': datetime.now(),
+            'last_run_cmd': sys.argv,
+            'links': links,
+        }
+    else:
+        output = links
+    return to_json(output, indent=4, sort_keys=True)
+
 
 @enforce_types
 def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
@@ -65,30 +78,6 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                             continue
     return ()
 
-@enforce_types
-def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
-    """write the json link index to a given path"""
-
-    assert isinstance(links, List), 'Links must be a list, not a generator.'
-    assert not links or isinstance(links[0].history, dict)
-    assert not links or isinstance(links[0].sources, list)
-
-    if links and links[0].history.get('title'):
-        assert isinstance(links[0].history['title'][0], ArchiveResult)
-
-    if links and links[0].sources:
-        assert isinstance(links[0].sources[0], str)
-
-    main_index_json = {
-        **MAIN_INDEX_HEADER,
-        'num_links': len(links),
-        'updated': datetime.now(),
-        'last_run_cmd': sys.argv,
-        'links': links,
-    }
-    atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json)
-
-
 ### Link Details Index
 
 @enforce_types

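The JSON counterpart produces one of two shapes: with headers it wraps the links in the MAIN_INDEX_HEADER metadata plus num_links, updated, and last_run_cmd; without, it serializes the bare list. A hedged sketch (links hypothetical, output abbreviated):

    from archivebox.index.json import generate_json_index_from_links

    print(generate_json_index_from_links(links, with_headers=True))
    # {"links": [...], "num_links": 2, "updated": "...", ...}
    print(generate_json_index_from_links(links, with_headers=False))
    # [{"url": "..."}, {"url": "..."}]

Note that deleting write_json_main_index also drops its runtime assertions (that links is a list and that history and sources have the expected shapes); the new helper performs no such validation.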
+ 0 - 32
archivebox/logging_util.py

@@ -477,39 +477,7 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
 
 @enforce_types
 def printable_folders(folders: Dict[str, Optional["Link"]],
-                      json: bool=False,
-                      html: bool=False,
-                      csv: Optional[str]=None,
                       with_headers: bool=False) -> str:
-
-    from .index.json import MAIN_INDEX_HEADER
-
-    links = folders.values()
-    if json:
-        from .index.json import to_json
-        if with_headers:
-            output = {
-                **MAIN_INDEX_HEADER,
-                'num_links': len(links),
-                'updated': datetime.now(),
-                'last_run_cmd': sys.argv,
-                'links': links,
-            }
-        else:
-            output = links
-        return to_json(output, indent=4, sort_keys=True)
-    elif html:
-        from .index.html import main_index_template
-        if with_headers:
-            output = main_index_template(links)
-        else:
-            from .index.html import MINIMAL_INDEX_TEMPLATE
-            output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
-        return output
-    elif csv:
-        from .index.csv import links_to_csv
-        return links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
-
     return '\n'.join(
         f'{folder} {link and link.url} "{link and link.title}"'
         for folder, link in folders.items()

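With the json/html/csv branches gone, printable_folders is reduced to the plain-text listing, one line per folder. Sample output (values hypothetical):

    1611234567.0 https://example.com "Example Domain"
    1611234568.0 None "None"

Folders whose index entry could not be matched print None for the url and title, since each dict value is an Optional[Link].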
+ 15 - 3
archivebox/main.py

@@ -49,12 +49,17 @@ from .index import (
 from .index.json import (
     parse_json_main_index,
     parse_json_links_details,
+    generate_json_index_from_links,
 )
 from .index.sql import (
     get_admins,
     apply_migrations,
     remove_from_sql_main_index,
 )
+from .index.html import (
+    generate_index_from_links,
+)
+from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
 from .config import (
     stderr,
@@ -745,7 +750,6 @@ def list_all(filter_patterns_str: Optional[str]=None,
     elif filter_patterns_str:
         filter_patterns = filter_patterns_str.split('\n')
 
-
     snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
@@ -761,8 +765,16 @@ def list_all(filter_patterns_str: Optional[str]=None,
         status=status,
         out_dir=out_dir,
     )
-
-    print(printable_folders(folders, json=json, csv=csv, html=html, with_headers=with_headers))
+
+    if json:
+        output = generate_json_index_from_links(folders.values(), with_headers)
+    elif html:
+        output = generate_index_from_links(folders.values(), with_headers)
+    elif csv:
+        output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
+    else:
+        output = printable_folders(folders, with_headers=with_headers)
+    print(output)
     return folders
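
Format dispatch now lives in list_all itself, so printable_folders no longer needs its inline imports of .index.json, .index.html, and .index.csv. A hedged example of driving each branch from Python (parameter names taken from the hunks above; filter values illustrative, CLI wiring not shown in this diff):

    from archivebox.main import list_all

    list_all(filter_patterns_str='example.com', filter_type='substring', json=True, with_headers=True)
    list_all(csv='timestamp,url,title', with_headers=True)   # header row included
    list_all(html=True)   # minimal HTML table, no headers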