
add chunk_size=500 to more iterator calls

Nick Sweeting authored 1 year ago
commit d0fefc0279
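
The change is the same one-liner in every hunk below: each bare QuerySet.iterator() call gains an explicit chunk_size=500. For context, a minimal sketch of what the parameter does (illustrative code, not part of this commit; Snapshot is ArchiveBox's snapshot model):

    from core.models import Snapshot  # ArchiveBox's Snapshot model

    # Plain iteration caches every row on the QuerySet, so a single pass
    # over a huge table holds the entire result set in memory:
    for snapshot in Snapshot.objects.all():
        ...

    # iterator() streams rows without caching them on the QuerySet;
    # chunk_size is how many rows are fetched from the database cursor
    # per fetchmany() call. Django's default is 2000; this commit pins it
    # to 500 to keep peak memory lower on very large collections.
    for snapshot in Snapshot.objects.all().iterator(chunk_size=500):
        ...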

archivebox/core/migrations/0027_update_snapshot_ids.py (+2 -2)

@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f'   Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
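
Both functions above are data-migration bodies: they take (apps, schema_editor) and resolve models through apps.get_model() so they run against the historical schema, not the current models. A minimal sketch of how such a function is typically wired into a migration (illustrative only, not the actual contents of 0027):

    from django.db import migrations

    def update_snapshot_ids(apps, schema_editor):
        # historical model state at this point in the migration graph
        Snapshot = apps.get_model("core", "Snapshot")
        # .only('abid') defers every other column; chunk_size=500 bounds
        # how many rows the cursor returns per batch
        for snapshot in Snapshot.objects.all().only('abid').iterator(chunk_size=500):
            ...  # per-row update, as in the hunk above

    class Migration(migrations.Migration):
        dependencies = []  # the real migration names its predecessor here
        operations = [
            # a noop reverse lets the migration be unapplied without
            # reverting the backfilled values
            migrations.RunPython(update_snapshot_ids, migrations.RunPython.noop),
        ]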

archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py (+1 -1)

@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])

archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py (+1 -1)

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id

archivebox/core/migrations/0059_tag_id.py (+1 -1)

@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f'   Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:

archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py (+1 -1)

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id

archivebox/extractors/__init__.py (+1 -1)

@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x
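
archive_links() accepts either a QuerySet or an already-materialized iterable of Link objects, so only the QuerySet branch can stream, and its length has to come from count() (a COUNT(*) query) rather than len(), since an iterator has no length. The same pattern in isolation (a sketch under those assumptions; iter_with_count is a hypothetical helper, and it uses isinstance in place of the repo's exact type check):

    from typing import Iterable, Iterator, Tuple, Union
    from django.db.models import QuerySet

    def iter_with_count(items: Union[QuerySet, Iterable]) -> Tuple[int, Iterator]:
        if isinstance(items, QuerySet):
            total = items.count()                    # SELECT COUNT(*), fetches no rows
            stream = items.iterator(chunk_size=500)  # rows arrive 500 at a time
        else:
            total = len(items)                       # already in memory
            stream = iter(items)
        return total, stream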

archivebox/index/__init__.py (+6 -6)

@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link
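
One detail worth noting across these helpers: most wrap the iterator in a generator expression, so Link objects are produced on demand and at most one chunk of Snapshot rows is alive at a time, while get_valid_folders uses a list comprehension and therefore still materializes every Link up front. The distinction in plain Python (no Django needed):

    def rows():
        # stands in for snapshots.iterator(chunk_size=500): yields rows lazily
        for i in range(1_000_000):
            yield i

    lazy = (r * 2 for r in rows())   # generator expression: nothing computed yet
    eager = [r * 2 for r in rows()]  # list comprehension: all rows processed now

    print(next(lazy))   # computes only the first value
    print(len(eager))   # the full million-item list already exists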