Browse Source

add sessions and seeds models

Nick Sweeting 1 year ago
parent
commit
c9f88f5875

+ 5 - 2
archivebox/core/settings.py

@@ -100,10 +100,13 @@ INSTALLED_APPS = [
     'django_object_actions',     # provides easy Django Admin action buttons on change views       https://github.com/crccheck/django-object-actions
 
     # Our ArchiveBox-provided apps
-    'config',                    # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
+    # 'abid_utils',                # handles ABID ID creation, handling, and models
+    'config',                    # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 
     'machine',                   # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'queues',                    # handles starting and managing background workers and processes
-    'abid_utils',                # handles ABID ID creation, handling, and models
+    'seeds',                     # handles Seed model and URL source management
+    'crawls',                    # handles Crawl and CrawlSchedule models and management
+    'sessions',                  # handles Persona and session management
     'core',                      # core django model with Snapshot, ArchiveResult, etc.
     'api',                       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 

+ 0 - 0
archivebox/seeds/__init__.py


+ 3 - 0
archivebox/seeds/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 6 - 0
archivebox/seeds/apps.py

@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class SeedsConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"  # primary-key field type used when a model in this app does not declare one
+    name = "seeds"  # same string as the 'seeds' entry added to INSTALLED_APPS

+ 0 - 0
archivebox/seeds/migrations/__init__.py


+ 67 - 0
archivebox/seeds/models.py

@@ -0,0 +1,67 @@
+__package__ = 'archivebox.seeds'
+
+
+from datetime import datetime
+
+from django_stubs_ext.db.models import TypedModelMeta
+
+from django.db import models
+from django.db.models import Q
+from django.core.validators import MaxValueValidator, MinValueValidator 
+from django.conf import settings
+from django.utils import timezone
+from django.utils.functional import cached_property
+from django.urls import reverse_lazy
+
+from pathlib import Path
+
+
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A source of URLs (+metadata) that can be queried over and over, e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+    Querying a Seed may yield the same list of URLs every time, or a different list on each query.
+    The URLs it yields are used to create a new Crawl and populate it with new pending Snapshots.
+
+    On crawl creation, a root_snapshot is first created whose URI is set to the Seed URI.
+    The seed's preferred extractor runs on that URI and produces an ArchiveResult containing outlinks.
+    Those outlinks are turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
+
+    Every time an Extractor consumes a Seed it can produce new urls, because Seeds can point to
+    stateful remote services, files with contents that change, directories that have new files within, etc.
+    """
+
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    uri = models.URLField(max_length=255, blank=False, null=False, unique=True)              # unique location that URLs get loaded from
+
+    extractor = models.CharField(default='auto', max_length=32)   # extractor suggested for loading this URL source
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='')          # tags attached to every URL that comes from this source
+    config = models.JSONField(default=dict)                                                  # extra config put in scope while loading URLs from this source
+
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+    @property
+    def source_type(self):
+        """
+        Lowercased part of the uri that precedes the first '://' (the whole uri if there is none),
+        e.g. http / https / file / pocketapi / s3 / etc.
+        """
+        scheme, _sep, _rest = self.uri.partition('://')
+        return scheme.lower()

+ 3 - 0
archivebox/seeds/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/seeds/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.

+ 0 - 0
archivebox/sessions/__init__.py


+ 3 - 0
archivebox/sessions/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 6 - 0
archivebox/sessions/apps.py

@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class SessionsConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"  # primary-key field type used when a model in this app does not declare one
+    name = "sessions"  # NOTE(review): django.contrib.sessions also uses the app label "sessions" - confirm the two cannot both be installed, or set a distinct label

+ 0 - 0
archivebox/sessions/migrations/__init__.py


+ 67 - 0
archivebox/sessions/models.py

@@ -0,0 +1,67 @@
+from django.db import models
+
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+from django.conf import settings
+
+
+class Persona(ABIDModel, ModelWithHealthStats):
+    """Aka a "SessionType", it's a template for a crawler browsing session containing some config."""
+
+    abid_prefix = 'prs_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.name'
+    abid_subtype_src = 'self.created_by'
+    abid_rand_src = 'self.id'
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    name = models.CharField(max_length=100, blank=False, null=False, editable=False)
+
+    # never set by hand: clean() overwrites it with settings.PERSONAS_DIR / name
+    persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False)
+    config = models.JSONField(default=dict)
+    # e.g. {
+    #    USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
+    #    COOKIES_TXT_FILE: '/path/to/cookies.txt',
+    #    CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
+    #    CHECK_SSL_VALIDITY: False,
+    #    SAVE_ARCHIVE_DOT_ORG: True,
+    #    CHROME_BINARY: 'chromium'
+    #    ...
+    # }
+    # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='')
+    # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
+
+    class Meta:
+        verbose_name = 'Session Type'
+        verbose_name_plural = 'Session Types'
+        unique_together = (('created_by', 'name'),)
+
+    def clean(self):
+        """Keep persona_dir in sync with name; runs as part of full_clean()."""
+        # persona_dir is always derived here, so it cannot drift from PERSONAS_DIR / name
+        self.persona_dir = settings.PERSONAS_DIR / self.name
+
+        # TODO: make sure config keys all exist in FLAT_CONFIG
+        # TODO: make sure config values all match expected types
+
+    def save(self, *args, **kwargs):
+        """Validate the instance with full_clean() (which calls clean()), then save it."""
+        self.full_clean()
+
+        # TODO: make sure basic file structure is present in persona_dir:
+        # - PERSONAS_DIR / self.name /
+        #   - chrome_profile/
+        #   - chrome_downloads/
+        #   - chrome_extensions/
+        #   - cookies.txt
+        #   - auth.json
+        #   - config.json    # json dump of the model
+
+        super().save(*args, **kwargs)

+ 3 - 0
archivebox/sessions/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/sessions/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.