@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'
 
 from typing import TYPE_CHECKING
+from pathlib import Path
 from django_stubs_ext.db.models import TypedModelMeta
 
 from django.db import models
@@ -12,12 +13,114 @@ from django.utils import timezone
 
 from workers.models import ModelWithStateMachine
+from archivebox.config import CONSTANTS
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
+
 if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult
 
-from seeds.models import Seed
-from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A fountain that produces URLs (+metadata) each time it's queried, e.g.:
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+
+    When a Crawl is created, a root_snapshot is first created with its URI set to the Seed's URI.
+    The Seed's preferred extractor is then run on that URI, producing an ArchiveResult that contains outlinks.
+    The outlinks are in turn used to create new pending Snapshots under the same Crawl,
+    and the cycle repeats until Crawl.max_depth is reached.
+
+    Each consumption of a Seed by an extractor can produce new URLs, as Seeds can point to
+    stateful remote services, files whose contents change, directories that gain new files, etc.
+    """
+
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    uri = models.URLField(max_length=2000, blank=False, null=False)    # unique source location where URLs will be loaded from
+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+
+    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+    crawl_set: models.Manager['Crawl']
+
+    class Meta:
+        verbose_name = 'Seed'
+        verbose_name_plural = 'Seeds'
+
+        unique_together = (('created_by', 'uri', 'extractor'),)
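+        # i.e. the same URI may be saved as multiple different Seeds, as long as
+        # the created_by user or the extractor differs between them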
+
+    @classmethod
+    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
+        # normalize the absolute path so the stored URI is relative to the data dir mount point
+        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+
+        seed, _ = cls.objects.get_or_create(
+            label=label or source_file.name,
+            uri=f'file://{source_path}',
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
+            extractor=parser,
+            tags_str=tag,
+            config=config or {},
+        )
+        seed.save()
+        return seed
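+        # e.g. (illustrative):
+        #   Seed.from_file(Path('/data/sources/2024-01-02_11-57-51__cli_add.txt'), parser='auto', tag='imported')
+        #   -> Seed(uri='file:///data/sources/2024-01-02_11-57-51__cli_add.txt', extractor='auto', tags_str='imported')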
+
+    @property
+    def source_type(self):
+        # e.g. http/https://
+        #      file://
+        #      pocketapi://
+        #      s3://
+        #      etc..
+        return self.uri.split('://', 1)[0].lower()
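+        # illustrative: Seed(uri='https://getpocket.com/user/nikisweeting/feed').source_type == 'https'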
+
+    @property
+    def api_url(self) -> str:
+        # /api/v1/core/seed/{uulid}
+        return reverse_lazy('api-1:get_seed', args=[self.abid])   # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
+
+    @property
+    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
+        from crawls.models import CrawlSchedule
+        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
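+        # i.e. every CrawlSchedule whose template Crawl was created from this Seed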
+
+    @property
+    def snapshot_set(self) -> QuerySet['Snapshot']:
+        from core.models import Snapshot
+
+        crawl_ids = self.crawl_set.values_list('pk', flat=True)
+        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
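+        # i.e. every Snapshot belonging to any Crawl that was created from this Seed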
+
+
+
 class CrawlSchedule(ABIDModel, ModelWithHealthStats):