|
|
@@ -67,16 +67,15 @@ def ignore_methods(to_ignore: List[str]):
|
|
|
return list(methods)
|
|
|
|
|
|
@enforce_types
|
|
|
-def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link:
|
|
|
+def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
|
|
|
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
|
|
|
|
|
|
# TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
|
|
|
- if not skip_index:
|
|
|
- from core.models import Snapshot, ArchiveResult
|
|
|
- try:
|
|
|
- snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
|
|
|
- except Snapshot.DoesNotExist:
|
|
|
- snapshot = write_link_to_sql_index(link)
|
|
|
+ from core.models import Snapshot, ArchiveResult
|
|
|
+ try:
|
|
|
+ snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
|
|
|
+ except Snapshot.DoesNotExist:
|
|
|
+ snapshot = write_link_to_sql_index(link)
|
|
|
|
|
|
ARCHIVE_METHODS = get_default_archive_methods()
|
|
|
|
|
|
@@ -93,7 +92,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|
|
os.makedirs(out_dir)
|
|
|
|
|
|
link = load_link_details(link, out_dir=out_dir)
|
|
|
- write_link_details(link, out_dir=out_dir, skip_sql_index=skip_index)
|
|
|
+ write_link_details(link, out_dir=out_dir, skip_sql_index=False)
|
|
|
log_link_archiving_started(link, out_dir, is_new)
|
|
|
link = link.overwrite(updated=datetime.now())
|
|
|
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
|
|
|
@@ -112,9 +111,8 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|
|
|
|
|
stats[result.status] += 1
|
|
|
log_archive_method_finished(result)
|
|
|
- if not skip_index:
|
|
|
- write_search_index(link=link, texts=result.index_texts)
|
|
|
- ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
|
|
|
+ write_search_index(link=link, texts=result.index_texts)
|
|
|
+ ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
|
|
|
output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
|
|
|
|
|
|
else:
|
|
|
@@ -135,7 +133,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
|
|
|
except Exception:
|
|
|
pass
|
|
|
|
|
|
- write_link_details(link, out_dir=out_dir, skip_sql_index=skip_index)
|
|
|
+ write_link_details(link, out_dir=out_dir, skip_sql_index=False)
|
|
|
|
|
|
log_link_archiving_finished(link, link.link_dir, is_new, stats)
|
|
|
|