
merge seeds and crawls apps

Nick Sweeting, 1 year ago
parent commit 65afd405b1

+ 0 - 1
archivebox/__init__.py

@@ -85,7 +85,6 @@ ARCHIVEBOX_BUILTIN_PLUGINS = {
     'workers': PACKAGE_DIR / 'workers',
     'core': PACKAGE_DIR / 'core',
     'crawls': PACKAGE_DIR / 'crawls',
-    'seeds': PACKAGE_DIR / 'seeds',
     # 'search': PACKAGE_DIR / 'search',
     # 'core': PACKAGE_DIR / 'core',
 }

+ 3 - 4
archivebox/api/v1_crawls.py

@@ -10,8 +10,7 @@ from django.contrib.auth import get_user_model
 from ninja import Router, Schema

 from core.models import Snapshot
-from crawls.models import Crawl
-from seeds.models import Seed
+from crawls.models import Seed, Crawl

 from .auth import API_AUTH_METHODS

@@ -19,7 +18,7 @@ router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)


 class SeedSchema(Schema):
-    TYPE: str = 'seeds.models.Seed'
+    TYPE: str = 'crawls.models.Seed'

     id: UUID
     abid: str
@@ -60,7 +59,7 @@ def get_seed(request, seed_id: str):


 class CrawlSchema(Schema):
-    TYPE: str = 'core.models.Crawl'
+    TYPE: str = 'crawls.models.Crawl'

     id: UUID
     abid: str

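Since the schema TYPE discriminators change from 'seeds.models.Seed' and 'core.models.Crawl' to the new crawls.* dotted paths, any API client that dispatches on them needs updating. A minimal sketch of such a check (the response dict shape is illustrative; only the TYPE values come from the schemas above):

    # hypothetical deserialized item from the v1 crawls API
    obj = {'TYPE': 'crawls.models.Seed', 'id': '...', 'abid': 'src_...'}

    # 'seeds.models.Seed' / 'core.models.Crawl' were the pre-merge values
    if obj['TYPE'] == 'crawls.models.Seed':
        print('seed record')
    elif obj['TYPE'] == 'crawls.models.Crawl':
        print('crawl record')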
+ 1 - 2
archivebox/cli/archivebox_add.py

@@ -51,8 +51,7 @@ def add(urls: str | list[str],
     setup_django()
     check_data_folder()
     
-    from seeds.models import Seed
-    from crawls.models import Crawl
+    from crawls.models import Seed, Crawl
     from workers.orchestrator import Orchestrator
     from abid_utils.models import get_or_create_system_user_pk


+ 1 - 2
archivebox/core/settings.py

@@ -65,8 +65,7 @@ INSTALLED_APPS = [
     'config',                    # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 
     'machine',                   # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'workers',                   # handles starting and managing background workers and processes (orchestrators and actors)
-    'seeds',                     # handles Seed model and URL source management
-    'crawls',                    # handles Crawl and CrawlSchedule models and management
+    'crawls',                    # handles Seed, Crawl, and CrawlSchedule models and management
     'personas',                  # handles Persona and session management
     'core',                      # core django model with Snapshot, ArchiveResult, etc.
     'api',                       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.

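Dropping the seeds app from INSTALLED_APPS means the Seed model now has to live in the crawls app's migration history. No migrations appear in this diff, but a move like this is commonly handled with a state-only migration. A minimal sketch, assuming the table keeps its old default name seeds_seed (file name, migration number, and dependency below are placeholders):

    from django.db import migrations, models

    class Migration(migrations.Migration):

        dependencies = [
            ('crawls', '0001_initial'),  # placeholder dependency
        ]

        operations = [
            migrations.SeparateDatabaseAndState(
                # re-declare Seed in the crawls app's migration state without touching the table
                state_operations=[
                    migrations.CreateModel(
                        name='Seed',
                        fields=[
                            ('id', models.UUIDField(primary_key=True, serialize=False)),
                            # remaining Seed fields elided for brevity
                        ],
                        options={'db_table': 'seeds_seed'},  # assumed original table name
                    ),
                ],
                database_operations=[],  # no schema change is performed
            ),
        ]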
+ 55 - 4
archivebox/crawls/admin.py

@@ -1,7 +1,5 @@
 __package__ = 'archivebox.crawls'

-import abx
-
 from django.utils.html import format_html, format_html_join
 from django.contrib import admin

@@ -10,7 +8,59 @@ from archivebox import DATA_DIR
 from abid_utils.admin import ABIDModelAdmin

 from core.models import Snapshot
-from crawls.models import Crawl, CrawlSchedule
+from crawls.models import Seed, Crawl, CrawlSchedule
+
+
+class SeedAdmin(ABIDModelAdmin):
+    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
+    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+    
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+    list_filter = ('extractor', 'created_by')
+    ordering = ['-created_at']
+    list_per_page = 100
+    actions = ["delete_selected"]
+
+    def num_crawls(self, obj):
+        return obj.crawl_set.count()
+
+    def num_snapshots(self, obj):
+        return obj.snapshot_set.count()
+
+    def scheduled_crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (scheduledcrawl.admin_change_url, scheduledcrawl)
+            for scheduledcrawl in  obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Scheduled Crawls yet...</i>')
+
+    def crawls(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (crawl.admin_change_url, crawl)
+            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Crawls yet...</i>')
+
+    def snapshots(self, obj):
+        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
+            (snapshot.admin_change_url, snapshot)
+            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
+        )) or format_html('<i>No Snapshots yet...</i>')
+
+    def contents(self, obj):
+        if obj.uri.startswith('file:///data/'):
+            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+            contents = ""
+            try:
+                contents = source_file.read_text().strip()[:14_000]
+            except Exception as e:
+                contents = f'Error reading {source_file}: {e}'
+                
+            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
+        
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
+



@@ -102,7 +152,8 @@ class CrawlScheduleAdmin(ABIDModelAdmin):
             for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
         )) or format_html('<i>No Snapshots yet...</i>')

-@abx.hookimpl
+
 def register_admin(admin_site):
+    admin_site.register(Seed, SeedAdmin)
     admin_site.register(Crawl, CrawlAdmin)
     admin_site.register(CrawlSchedule, CrawlScheduleAdmin)

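With the @abx.hookimpl decorator removed from register_admin here, the hook is presumably provided by the crawls package entrypoint instead (not shown in this diff). A sketch of that wiring, modeled on the deleted seeds/__init__.py further down:

    # hypothetical crawls/__init__.py plugin hook, mirroring the removed seeds/__init__.py
    import abx

    @abx.hookimpl
    def register_admin(admin_site):
        from .admin import register_admin as register_crawls_admin
        register_crawls_admin(admin_site)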
+ 105 - 2
archivebox/crawls/models.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'

 from typing import TYPE_CHECKING
+from pathlib import Path
 from django_stubs_ext.db.models import TypedModelMeta

 from django.db import models
@@ -12,12 +13,114 @@ from django.utils import timezone

 from workers.models import ModelWithStateMachine

+from archivebox.config import CONSTANTS
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
+
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult

-from seeds.models import Seed

-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+        
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
+
+    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
+    stateful remote services, files with contents that change, directories that have new files within, etc.
+    """
+    
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+    
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+    
+    uri = models.URLField(max_length=2000, blank=False, null=False)                          # unique source location where URLs will be loaded from
+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+    
+    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+    
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+
+    crawl_set: models.Manager['Crawl']
+
+    class Meta:
+        verbose_name = 'Seed'
+        verbose_name_plural = 'Seeds'
+        
+        unique_together = (('created_by', 'uri', 'extractor'),)
+
+
+    @classmethod
+    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
+        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+        
+        seed, _ = cls.objects.get_or_create(
+            label=label or source_file.name,
+            uri=f'file://{source_path}',
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
+            extractor=parser,
+            tags_str=tag,
+            config=config or {},
+        )
+        seed.save()
+        return seed
+
+    @property
+    def source_type(self):
+        # e.g. http/https://
+        #      file://
+        #      pocketapi://
+        #      s3://
+        #      etc..
+        return self.uri.split('://', 1)[0].lower()
+
+    @property
+    def api_url(self) -> str:
+        # /api/v1/core/seed/{uulid}
+        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
+
+    @property
+    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
+        from crawls.models import CrawlSchedule
+        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+        
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
+
+
+


 class CrawlSchedule(ABIDModel, ModelWithHealthStats):

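The Seed API itself is unchanged by the move; only its import path is different. A small usage sketch based on the class above (the source path and tag values are illustrative):

    from pathlib import Path
    from crawls.models import Seed   # previously: from seeds.models import Seed

    # get or create a Seed pointing at a sources file inside the data dir
    seed = Seed.from_file(
        Path('/data/sources/2024-01-02_11-57-51__cli_add.txt'),  # illustrative path
        label='cli import',
        parser='auto',
        tag='imported,cli',
    )

    print(seed.source_type)           # 'file' (scheme portion of seed.uri)
    print(seed.crawl_set.count())     # crawls created from this seed so far
    print(seed.snapshot_set.count())  # snapshots across all of those crawls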
+ 1 - 1
archivebox/search/__init__.py

@@ -34,7 +34,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     return []


-# This should be abstracted by a plugin interface for extractors
+# TODO: This should be abstracted by a plugin interface for extractors
 @enforce_types
 def get_indexable_content(results: QuerySet):
     if not results:

+ 2 - 1
archivebox/search/admin.py

@@ -1,10 +1,11 @@
 __package__ = 'archivebox.search'

 from django.contrib import messages
+from django.contrib import admin

 from archivebox.search import query_search_index

-class SearchResultsAdminMixin:
+class SearchResultsAdminMixin(admin.ModelAdmin):
     def get_search_results(self, request, queryset, search_term: str):
         """Enhances the search queryset with results from the search backend"""
         

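Having SearchResultsAdminMixin inherit from admin.ModelAdmin presumably lets super().get_search_results() resolve cleanly for type checkers while leaving the usual MRO behavior intact. A sketch of how such a mixin is typically combined with a concrete admin (SnapshotAdmin and its fields here are illustrative, not taken from this diff):

    from django.contrib import admin
    from archivebox.search.admin import SearchResultsAdminMixin
    from core.models import Snapshot

    # the mixin comes first so its get_search_results() wraps the default admin search
    class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        search_fields = ('url', 'title')  # illustrative fields

    # admin.site.register(Snapshot, SnapshotAdmin)  # or register on a custom admin site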
+ 0 - 12
archivebox/seeds/__init__.py

@@ -1,12 +0,0 @@
-
-__package__ = 'archivebox.seeds'
-__order__ = 100
-
-import abx
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    from .admin import register_admin as register_seeds_admin
-    register_seeds_admin(admin_site)
-

+ 0 - 68
archivebox/seeds/admin.py

@@ -1,68 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-import abx
-
-from django.utils.html import format_html_join, format_html
-
-from abid_utils.admin import ABIDModelAdmin
-
-from archivebox import DATA_DIR
-
-from seeds.models import Seed
-
-
-
-class SeedAdmin(ABIDModelAdmin):
-    list_display = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('abid', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('abid', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    
-    readonly_fields = ('created_at', 'modified_at', 'abid_info', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
-    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
-
-    list_filter = ('extractor', 'created_by')
-    ordering = ['-created_at']
-    list_per_page = 100
-    actions = ["delete_selected"]
-
-    def num_crawls(self, obj):
-        return obj.crawl_set.count()
-
-    def num_snapshots(self, obj):
-        return obj.snapshot_set.count()
-
-    def scheduled_crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (scheduledcrawl.admin_change_url, scheduledcrawl)
-            for scheduledcrawl in  obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Scheduled Crawls yet...</i>')
-
-    def crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (crawl.admin_change_url, crawl)
-            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Crawls yet...</i>')
-
-    def snapshots(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
-        )) or format_html('<i>No Snapshots yet...</i>')
-
-    def contents(self, obj):
-        if obj.uri.startswith('file:///data/'):
-            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
-            contents = ""
-            try:
-                contents = source_file.read_text().strip()[:14_000]
-            except Exception as e:
-                contents = f'Error reading {source_file}: {e}'
-                
-            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-        
-        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-@abx.hookimpl
-def register_admin(admin_site):
-    admin_site.register(Seed, SeedAdmin)

+ 0 - 6
archivebox/seeds/apps.py

@@ -1,6 +0,0 @@
-from django.apps import AppConfig
-
-
-class SeedsConfig(AppConfig):
-    default_auto_field = "django.db.models.BigAutoField"
-    name = "seeds"

+ 0 - 0
archivebox/seeds/migrations/__init__.py


+ 0 - 115
archivebox/seeds/models.py

@@ -1,115 +0,0 @@
-__package__ = 'archivebox.seeds'
-
-from typing import TYPE_CHECKING
-from pathlib import Path
-
-from django.db import models
-from django.db.models import QuerySet
-from django.conf import settings
-from django.urls import reverse_lazy
-
-from archivebox.config import CONSTANTS
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
-
-if TYPE_CHECKING:
-    from crawls.models import Crawl, CrawlSchedule
-    from core.models import Snapshot
-
-
-class Seed(ABIDModel, ModelWithHealthStats):
-    """
-    A fountain that produces URLs (+metadata) each time it's queried e.g.
-        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
-        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
-        - https://getpocket.com/user/nikisweeting/feed
-        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
-        - ...
-    Each query of a Seed can produce the same list of URLs, or a different list each time.
-    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
-        
-    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
-    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
-    The outlinks then get turned into new pending Snapshots under the same crawl,
-    and the cycle repeats until Crawl.max_depth.
-
-    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files with contents that change, directories that have new files within, etc.
-    """
-    
-    abid_prefix = 'src_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.uri'
-    abid_subtype_src = 'self.extractor'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-    
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-    
-    uri = models.URLField(max_length=2000, blank=False, null=False)                          # unique source location where URLs will be loaded from
-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-    
-    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-    
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-
-
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-        
-        unique_together = (('created_by', 'uri', 'extractor'),)
-
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
-        
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name,
-            uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser,
-            tags_str=tag,
-            config=config or {},
-        )
-        seed.save()
-        return seed
-
-    @property
-    def source_type(self):
-        # e.g. http/https://
-        #      file://
-        #      pocketapi://
-        #      s3://
-        #      etc..
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/seed/{uulid}
-        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
-
-    @property
-    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
-        from crawls.models import CrawlSchedule
-        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-        
-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)

+ 0 - 3
archivebox/seeds/tests.py

@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.

+ 0 - 3
archivebox/seeds/views.py

@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.