
move crawl models back into dedicated app

Nick Sweeting, 1 year ago
parent commit 2a1afcf6c2
3 changed files with 310 additions and 1039 deletions
  1. +1 −397    archivebox/core/models.py
  2. +145 −61   archivebox/crawls/models.py
  3. +164 −581  archivebox/workers/actor.py

+ 1 - 397
archivebox/core/models.py

@@ -41,6 +41,7 @@ from workers.tasks import bg_archive_snapshot
 from tags.models import KVTag
 # from machine.models import Machine, NetworkInterface
 
+from crawls.models import Seed, Crawl, CrawlSchedule
 
 
 class Tag(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ABIDModel):
@@ -133,403 +134,6 @@ class SnapshotTag(models.Model):
         unique_together = [('snapshot', 'tag')]
 
 
-class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
-    """
-    A fountain that produces URLs (+metadata) each time it's queried e.g.
-        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
-        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
-        - https://getpocket.com/user/nikisweeting/feed
-        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
-        - ...
-    Each query of a Seed can produce the same list of URLs, or a different list each time.
-    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
-        
-    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
-    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
-    The outlinks then get turned into new pending Snapshots under the same crawl,
-    and the cycle repeats until Crawl.max_depth.
-
-    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files with contents that change, directories that have new files within, etc.
-    """
-    
-    ### ModelWithReadOnlyFields:
-    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'uri')
-    
-    ### Immutable fields
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)                  # unique source location where URLs will be loaded from
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
-    
-    ### Mutable fields:
-    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
-    modified_at = models.DateTimeField(auto_now=True)
-
-    ### ModelWithConfig:
-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-
-    ### ModelWithOutputDir:
-    output_dir = models.CharField(max_length=255, null=False, blank=True, default='', help_text='The directory to store the output of this seed')
-
-    ### ModelWithNotes:
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-
-    ### ModelWithKVTags:
-    tag_set = GenericRelation(
-        KVTag,
-        related_query_name="seed",
-        content_type_field="obj_type",
-        object_id_field="obj_id",
-        order_by=('name',),
-    )
-    
-    ### ABIDModel:
-    abid_prefix = 'src_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.uri'
-    abid_subtype_src = 'self.extractor'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-    
-    ### Managers:
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-        
-        unique_together = (('created_by', 'uri', 'extractor'),('created_by', 'label'))
-
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
-        
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name,
-            uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser,
-            tags_str=tag,
-            config=config or {},
-        )
-        seed.save()
-        return seed
-
-    @property
-    def source_type(self):
-        # e.g. http/https://
-        #      file://
-        #      pocketapi://
-        #      s3://
-        #      etc..
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/seed/{uulid}
-        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
-
-    @property
-    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
-        from crawls.models import CrawlSchedule
-        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-        
-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
-
-
-
-
-class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithNotes, ModelWithHealthStats):
-    """
-    A record for a job that should run repeatedly on a given schedule.
-    
-    It pulls from a given Seed and creates a new Crawl for each scheduled run.
-    The new Crawl will inherit all the properties of the crawl_template Crawl.
-    """
-    ### ModelWithReadOnlyFields:
-    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id')
-    
-    ### Immutable fields:
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
-    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
-    
-    ### Mutable fields
-    schedule = models.CharField(max_length=64, blank=False, null=False, help_text='The schedule to run this crawl on in CRON syntax e.g. 0 0 * * * (see https://crontab.guru/)')
-    is_enabled = models.BooleanField(default=True)
-    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this scheduled crawl')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
-    modified_at = models.DateTimeField(auto_now=True)
-    
-    ### ModelWithKVTags:
-    tag_set = GenericRelation(
-        KVTag,
-        related_query_name="crawlschedule",
-        content_type_field="obj_type",
-        object_id_field="obj_id",
-        order_by=('name',),
-    )
-    
-    ### ABIDModel:
-    abid_prefix = 'cws_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.template.seed.uri'
-    abid_subtype_src = 'self.template.persona'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-    
-    ### Managers:
-    crawl_set: models.Manager['Crawl']
-    
-    class Meta(TypedModelMeta):
-        verbose_name = 'Scheduled Crawl'
-        verbose_name_plural = 'Scheduled Crawls'
-        
-    def __str__(self) -> str:
-        uri = (self.template and self.template.seed and self.template.seed.uri) or '<no url set>'
-        crawl_label = self.label or (self.template and self.template.seed and self.template.seed.label) or 'Untitled Crawl'
-        if self.id and self.template:
-            return f'[{self.ABID}] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
-        return f'[{self.abid_prefix}****not*saved*yet****] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
-    
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/crawlschedule/{uulid}
-        return reverse_lazy('api-1:get_any', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_any'
-    
-    def save(self, *args, **kwargs):
-        self.label = self.label or self.template.seed.label or self.template.seed.uri
-        super().save(*args, **kwargs)
-        
-        # make sure the template crawl points to this schedule as its schedule
-        self.template.schedule = self
-        self.template.save()
-        
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-        
-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
-    
-
-class CrawlManager(models.Manager):
-    pass
-
-class CrawlQuerySet(models.QuerySet):
-    """
-    Enhanced QuerySet for Crawl that adds some useful methods.
-    
-    To get all the snapshots for a given set of Crawls:
-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').snapshots() -> QuerySet[Snapshot]
-    
-    To get all the archiveresults for a given set of Crawls:
-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').archiveresults() -> QuerySet[ArchiveResult]
-    
-    To export the list of Crawls as a CSV or JSON:
-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_csv() -> str
-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_json() -> str
-    """
-    def snapshots(self, **filter_kwargs) -> QuerySet['Snapshot']:
-        return Snapshot.objects.filter(crawl_id__in=self.values_list('pk', flat=True), **filter_kwargs)
-    
-    def archiveresults(self) -> QuerySet['ArchiveResult']:
-        return ArchiveResult.objects.filter(snapshot__crawl_id__in=self.values_list('pk', flat=True))
-    
-    def as_csv_str(self, keys: Iterable[str]=()) -> str:
-        return '\n'.join(
-            row.as_csv(keys=keys)
-            for row in self.all()
-        )
-    
-    def as_jsonl_str(self, keys: Iterable[str]=()) -> str:
-        return '\n'.join([
-            row.as_jsonl_row(keys=keys)
-            for row in self.all()
-        ])
-
-
-
-class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
-    """
-    A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
-
-    A new Crawl should be created for each load of a Seed (because it can produce a different set of URLs every time it's loaded).
-    E.g. every scheduled import from an RSS feed should create a new Crawl, and each subsequent load of the same seed creates a new Crawl.
-    
-    Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a 
-    file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
-    """
-    
-    ### ModelWithReadOnlyFields:
-    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'seed')
-    
-    ### Immutable fields:
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
-    seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
-    
-    ### Mutable fields:
-    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl, one per line, should be 1:1 with snapshot_set')
-    config = models.JSONField(default=dict)
-    max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
-    tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
-    persona_id = models.UUIDField(null=True, blank=True)  # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
-    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
-    schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    
-    ### ModelWithKVTags:
-    tag_set = GenericRelation(
-        KVTag,
-        related_query_name="crawl",
-        content_type_field="obj_type",
-        object_id_field="obj_id",
-        order_by=('name',),
-    )
-    
-    ### ModelWithStateMachine:
-    state_machine_name = 'crawls.statemachines.CrawlMachine'
-    retry_at_field_name = 'retry_at'
-    state_field_name = 'status'
-    StatusChoices = ModelWithStateMachine.StatusChoices
-    active_state = StatusChoices.STARTED
-    
-    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-
-    ### ABIDModel:
-    abid_prefix = 'cwl_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.seed.uri'
-    abid_subtype_src = 'self.persona'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-    
-    ### Managers:    
-    snapshot_set: models.Manager['Snapshot']
-    
-    # @property
-    # def persona(self) -> Persona:
-    #     # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
-    #     return self.persona_id
-    
-
-    class Meta(TypedModelMeta):
-        verbose_name = 'Crawl'
-        verbose_name_plural = 'Crawls'
-        
-    def __str__(self):
-        url = (self.seed and self.seed.uri) or '<no url set>'
-        parser = (self.seed and self.seed.extractor) or 'auto'
-        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
-        if self.id and self.seed:
-            return f'[{self.ABID}] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})'
-        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})'
-        
-    @classmethod
-    def from_seed(cls, seed: Seed, max_depth: int=0, persona: str='Default', tags_str: str='', config: dict|None=None, created_by: int|None=None):
-        crawl, _ = cls.objects.get_or_create(
-            seed=seed,
-            max_depth=max_depth,
-            tags_str=tags_str or seed.tags_str,
-            persona=persona or seed.config.get('DEFAULT_PERSONA') or 'Default',
-            config=seed.config or config or {},
-            created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id,
-        )
-        crawl.save()
-        return crawl
-        
-    @property
-    def template(self):
-        """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off"""
-        if not self.schedule:
-            return None
-        return self.schedule.template
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/crawl/{uulid}
-        # TODO: implement get_crawl
-        return reverse_lazy('api-1:get_crawl', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
-    
-    def pending_snapshots(self) -> QuerySet['Snapshot']:
-        return self.snapshot_set.filter(retry_at__isnull=False)
-    
-    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
-        from core.models import ArchiveResult
-        
-        snapshot_ids = self.snapshot_set.values_list('id', flat=True)
-        pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, retry_at__isnull=False)
-        return pending_archiveresults
-    
-    def create_root_snapshot(self) -> 'Snapshot':
-        print(f'Crawl[{self.ABID}].create_root_snapshot()')
-        from core.models import Snapshot
-        
-        try:
-            return Snapshot.objects.get(crawl=self, url=self.seed.uri)
-        except Snapshot.DoesNotExist:
-            pass
-
-        root_snapshot, _ = Snapshot.objects.update_or_create(
-            crawl=self,
-            url=self.seed.uri,
-            defaults={
-                'status': Snapshot.INITIAL_STATE,
-                'retry_at': timezone.now(),
-                'timestamp': str(timezone.now().timestamp()),
-                # 'config': self.seed.config,
-            },
-        )
-        root_snapshot.save()
-        return root_snapshot
-
-
-class Outlink(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags):
-    """A record of a link found on a page, pointing to another page."""
-    read_only_fields = ('id', 'src', 'dst', 'crawl', 'via')
-    
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    
-    src = models.URLField()   # parent page where the outlink/href was found       e.g. https://example.com/downloads
-    dst = models.URLField()   # remote location the child outlink/href points to   e.g. https://example.com/downloads/some_file.pdf
-    
-    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set')
-    via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set')
-
-    class Meta:
-        unique_together = (('src', 'dst', 'via'),)
-
-
-
 
 def validate_timestamp(value):
     assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
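
The Seed docstring above describes a load → crawl → snapshot cycle. Below is a minimal, hypothetical sketch of that flow using only the methods visible in this diff (Seed.from_file, Crawl.from_seed, Crawl.create_root_snapshot); it assumes a configured ArchiveBox/Django environment and is illustrative, not the project's documented API.

    from pathlib import Path
    from crawls.models import Seed, Crawl

    # 1. create (or fetch) a Seed pointing at a local sources file
    seed = Seed.from_file(Path('/data/sources/2024-01-02_11-57-51__cli_add.txt'),
                          label='CLI import', parser='auto')

    # 2. each load of the Seed gets its own Crawl session
    crawl = Crawl.from_seed(seed, max_depth=1)

    # 3. the crawl starts from a root Snapshot whose URL is the seed URI;
    #    its extractor output yields outlinks that become new pending Snapshots
    root = crawl.create_root_snapshot()
    assert root.url == seed.uri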

+ 145 - 61
archivebox/crawls/models.py

@@ -12,7 +12,7 @@ from django.urls import reverse_lazy
 from django.utils import timezone
 
 from archivebox.config import CONSTANTS
-from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
+from base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
 
 from workers.models import ModelWithStateMachine
 
@@ -21,7 +21,8 @@ if TYPE_CHECKING:
 
 
 
-class Seed(ABIDModel, ModelWithHealthStats):
+
+class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
     """
     A fountain that produces URLs (+metadata) each time it's queried e.g.
         - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
@@ -42,36 +43,55 @@ class Seed(ABIDModel, ModelWithHealthStats):
     stateful remote services, files with contents that change, directories that have new files within, etc.
     """
     
-    abid_prefix = 'src_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.uri'
-    abid_subtype_src = 'self.extractor'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
+    ### ModelWithReadOnlyFields:
+    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'uri')
     
+    ### Immutable fields
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)                  # unique source location where URLs will be loaded from
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
     
-    uri = models.URLField(max_length=2000, blank=False, null=False)                          # unique source location where URLs will be loaded from
-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
-    
+    ### Mutable fields:
     extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
     tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
-    
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
     modified_at = models.DateTimeField(auto_now=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
 
+    ### ModelWithConfig:
+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
+
+    ### ModelWithOutputDir:
+    output_dir = models.CharField(max_length=255, null=False, blank=True, default='', help_text='The directory to store the output of this seed')
 
+    ### ModelWithNotes:
+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
+
+    ### ModelWithKVTags:
+    tag_set = GenericRelation(
+        KVTag,
+        related_query_name="seed",
+        content_type_field="obj_type",
+        object_id_field="obj_id",
+        order_by=('name',),
+    )
+    
+    ### ABIDModel:
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+    
+    ### Managers:
     crawl_set: models.Manager['Crawl']
 
     class Meta:
         verbose_name = 'Seed'
         verbose_name_plural = 'Seeds'
         
-        unique_together = (('created_by', 'uri', 'extractor'),)
+        unique_together = (('created_by', 'uri', 'extractor'),('created_by', 'label'))
 
 
     @classmethod
@@ -122,35 +142,48 @@ class Seed(ABIDModel, ModelWithHealthStats):
 
 
 
-
-class CrawlSchedule(ABIDModel, ModelWithHealthStats):
+class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithNotes, ModelWithHealthStats):
     """
     A record for a job that should run repeatedly on a given schedule.
     
     It pulls from a given Seed and creates a new Crawl for each scheduled run.
     The new Crawl will inherit all the properties of the crawl_template Crawl.
     """
-    abid_prefix = 'cws_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.created_by_id'
-    abid_subtype_src = 'self.schedule'
-    abid_rand_src = 'self.id'
+    ### ModelWithReadOnlyFields:
+    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id')
     
+    ### Immutable fields:
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
+    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
     
+    ### Mutable fields
     schedule = models.CharField(max_length=64, blank=False, null=False, help_text='The schedule to run this crawl on in CRON syntax e.g. 0 0 * * * (see https://crontab.guru/)')
+    is_enabled = models.BooleanField(default=True)
     label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this scheduled crawl')
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
-    
-    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
-    
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
     
-    is_enabled = models.BooleanField(default=True)
+    ### ModelWithKVTags:
+    tag_set = GenericRelation(
+        KVTag,
+        related_query_name="crawlschedule",
+        content_type_field="obj_type",
+        object_id_field="obj_id",
+        order_by=('name',),
+    )
+    
+    ### ABIDModel:
+    abid_prefix = 'cws_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.template.seed.uri'
+    abid_subtype_src = 'self.template.persona'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
     
+    ### Managers:
     crawl_set: models.Manager['Crawl']
     
     class Meta(TypedModelMeta):
@@ -189,9 +222,44 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
         return Snapshot.objects.filter(crawl_id__in=crawl_ids)
     
 
+class CrawlManager(models.Manager):
+    pass
+
+class CrawlQuerySet(models.QuerySet):
+    """
+    Enhanced QuerySet for Crawl that adds some useful methods.
+    
+    To get all the snapshots for a given set of Crawls:
+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').snapshots() -> QuerySet[Snapshot]
+    
+    To get all the archiveresults for a given set of Crawls:
+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').archiveresults() -> QuerySet[ArchiveResult]
+    
+    To export the list of Crawls as a CSV or JSON:
+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_csv() -> str
+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_json() -> str
+    """
+    def snapshots(self, **filter_kwargs) -> QuerySet['Snapshot']:
+        return Snapshot.objects.filter(crawl_id__in=self.values_list('pk', flat=True), **filter_kwargs)
     
+    def archiveresults(self) -> QuerySet['ArchiveResult']:
+        return ArchiveResult.objects.filter(snapshot__crawl_id__in=self.values_list('pk', flat=True))
+    
+    def as_csv_str(self, keys: Iterable[str]=()) -> str:
+        return '\n'.join(
+            row.as_csv(keys=keys)
+            for row in self.all()
+        )
+    
+    def as_jsonl_str(self, keys: Iterable[str]=()) -> str:
+        return '\n'.join([
+            row.as_jsonl_row(keys=keys)
+            for row in self.all()
+        ])
 
-class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
+
+
+class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
     """
     """
     A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
     A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
 
 
@@ -201,49 +269,63 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
     Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a 
     Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a 
     file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
     file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
     """
     """
-    abid_prefix = 'cwl_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.seed.uri'
-    abid_subtype_src = 'self.persona'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
     
-    state_machine_name = 'crawls.statemachines.CrawlMachine'
-    retry_at_field_name = 'retry_at'
-    state_field_name = 'status'
-    StatusChoices = ModelWithStateMachine.StatusChoices
-    active_state = StatusChoices.STARTED
+    ### ModelWithReadOnlyFields:
+    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'seed')
     
+    ### Immutable fields:
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
-
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    
-    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
     seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
-    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl')
     
+    ### Mutable fields:
+    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl, one per line, should be 1:1 with snapshot_set')
+    config = models.JSONField(default=dict)
+    max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
+    tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
+    persona_id = models.UUIDField(null=True, blank=True)  # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
     label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
+    schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
+    modified_at = models.DateTimeField(auto_now=True)
     
-    max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
-    tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
-    persona = models.CharField(max_length=32, blank=True, null=False, default='auto')
-    config = models.JSONField(default=dict)
+    ### ModelWithKVTags:
+    tag_set = GenericRelation(
+        KVTag,
+        related_query_name="crawl",
+        content_type_field="obj_type",
+        object_id_field="obj_id",
+        order_by=('name',),
+    )
     
-    schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
+    ### ModelWithStateMachine:
+    state_machine_name = 'crawls.statemachines.CrawlMachine'
+    retry_at_field_name = 'retry_at'
+    state_field_name = 'status'
+    StatusChoices = ModelWithStateMachine.StatusChoices
+    active_state = StatusChoices.STARTED
     
-    # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
-    # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
-    # schedule = models.JSONField()
-    # config = models.JSONField()
+    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+
+    ### ABIDModel:
+    abid_prefix = 'cwl_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.seed.uri'
+    abid_subtype_src = 'self.persona'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
     
+    ### Managers:
     snapshot_set: models.Manager['Snapshot']
     
+    # @property
+    # def persona(self) -> Persona:
+    #     # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
+    #     return self.persona_id
+    
 
     class Meta(TypedModelMeta):
         verbose_name = 'Crawl'
@@ -305,7 +387,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
             return Snapshot.objects.get(crawl=self, url=self.seed.uri)
         except Snapshot.DoesNotExist:
             pass
-  
+
         root_snapshot, _ = Snapshot.objects.update_or_create(
             crawl=self,
             url=self.seed.uri,
@@ -320,8 +402,10 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
         return root_snapshot
 
 
-class Outlink(models.Model):
+class Outlink(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags):
     """A record of a link found on a page, pointing to another page."""
+    read_only_fields = ('id', 'src', 'dst', 'crawl', 'via')
+    
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     
     src = models.URLField()   # parent page where the outlink/href was found       e.g. https://example.com/downloads
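
The CrawlSchedule above implements a template pattern: each scheduled run copies a template Crawl, and CrawlSchedule.save() points the template back at its schedule. Below is a hypothetical sketch of that pattern plus the CrawlQuerySet fan-out, assuming Crawl.objects is wired to CrawlQuerySet as its docstring implies; names and values are illustrative only.

    from crawls.models import Seed, Crawl, CrawlSchedule

    seed = Seed.objects.get(uri='https://example.com/some/rss.xml')
    template = Crawl.from_seed(seed, max_depth=1)

    # nightly re-crawl of the same seed; save() also sets template.schedule = nightly
    nightly = CrawlSchedule(template=template, schedule='0 0 * * *', is_enabled=True)
    nightly.save()

    # fan out across every Crawl produced from this seed (per the CrawlQuerySet docstring)
    snapshots = Crawl.objects.filter(seed__uri=seed.uri).snapshots()
    results = Crawl.objects.filter(seed__uri=seed.uri).archiveresults()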

+ 164 - 581
archivebox/workers/actor.py

@@ -1,583 +1,166 @@
-__package__ = 'archivebox.workers'
-
-import os
-import time
-import traceback
-from typing import ClassVar, Generic, TypeVar, Any, Literal, Type, Iterable, cast, get_args
-from datetime import timedelta
-import multiprocessing
-from multiprocessing import Process, cpu_count
-
-import psutil
-from rich import print
-from statemachine import State, StateMachine
-
-from django import db
-from django.db.models import QuerySet, sql, Q
-from django.db.models import Model as DjangoModel
-from django.utils import timezone
-from django.utils.functional import classproperty
-
-# from archivebox.logging_util import TimedProgress
-
-from .models import ModelWithStateMachine
-
-
-multiprocessing.set_start_method('fork', force=True)
-
-
-class ActorObjectAlreadyClaimed(Exception):
-    """Raised when the Actor tries to claim the next object from the queue but it's already been claimed by another Actor"""
-    pass
-
-class ActorQueueIsEmpty(Exception):
-    """Raised when the Actor tries to get the next object from the queue but it's empty"""
-    pass
-
-CPU_COUNT = cpu_count()
-DEFAULT_MAX_TICK_TIME = 60
-DEFAULT_MAX_CONCURRENT_ACTORS = min(max(2, int(CPU_COUNT * 0.6)), 8)   # 2 < (60% * num available cpu cores) < 8
-
-limit = lambda n, max: min(n, max)
-
-LaunchKwargs = dict[str, Any]
-ObjectState = State | str
-ObjectStateList = Iterable[ObjectState]
-
-ModelType = TypeVar('ModelType', bound=ModelWithStateMachine)
-
-class ActorType(Generic[ModelType]):
-    """
-    Base class for all actors. Usage:
-    
-    class FaviconActor(ActorType[FaviconArchiveResult]):
-        ACTIVE_STATE: ClassVar[str] = 'started'
-        
-        @classmethod
-        def qs(cls) -> QuerySet[FaviconArchiveResult]:
-            return ArchiveResult.objects.filter(extractor='favicon')   # or leave the default: FaviconArchiveResult.objects.all()
-    """
-    
-    ### Class attributes (defined on the class at compile-time when ActorType[MyModel] is defined)
-    Model: Type[ModelType]
-    StateMachineClass: Type[StateMachine]
-    
-    ACTIVE_STATE: ClassVar[ObjectState] = 'started'
-    EVENT_NAME: ClassVar[str] = 'tick'                                    # the event name to trigger on the obj.sm: StateMachine (usually 'tick')
-    
-    CLAIM_ORDER: ClassVar[tuple[str, ...]] = ('-retry_at',)                # the .order_by(*args) to claim the queue objects in, use ('?',) for random order
-    CLAIM_FROM_TOP_N: ClassVar[int] = CPU_COUNT * 10                      # the number of objects to consider when atomically getting the next object from the queue
-    CLAIM_ATOMIC: ClassVar[bool] = True                                   # whether to atomically fetch+claim the next object in one query, or fetch and lock it in two queries
-    
-    MAX_TICK_TIME: ClassVar[int] = DEFAULT_MAX_TICK_TIME                  # maximum duration in seconds to process a single object
-    MAX_CONCURRENT_ACTORS: ClassVar[int] = DEFAULT_MAX_CONCURRENT_ACTORS  # maximum number of concurrent actors that can be running at once
-    
-    _SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = []      # used to record all the pids of Actors spawned on the class
-    
-    ### Instance attributes (only used within an actor instance inside a spawned actor thread/process)
-    pid: int = os.getpid()
-    idle_count: int = 0
-    launch_kwargs: LaunchKwargs = {}
-    mode: Literal['thread', 'process'] = 'process'
-    
-    def __init_subclass__(cls) -> None:
-        """
-        Executed at class definition time (i.e. during import of any file containing class MyActor(ActorType[MyModel]): ...).
-        Loads the django Model from the Generic[ModelType] TypeVar arg and populates any missing class-level config using it.
-        """
-        if getattr(cls, 'Model', None) is None:
-            cls.Model = cls._get_model_from_generic_typevar()
-        cls._populate_missing_classvars_from_model(cls.Model)
-    
-    def __init__(self, mode: Literal['thread', 'process']|None=None, **launch_kwargs: LaunchKwargs):
-        """
-        Executed right before the Actor is spawned to create a unique Actor instance for that thread/process.
-        actor_instance.runloop() is then executed from inside the newly spawned thread/process.
-        """
-        self.mode = mode or self.mode
-        self.launch_kwargs = launch_kwargs or dict(self.launch_kwargs)
-    
-
-    ### Private Helper Methods: Not designed to be overridden by subclasses or called by anything outside of this class
-    
-    @classproperty
-    def name(cls) -> str:
-        return cls.__name__  # type: ignore
-    
-    def __str__(self) -> str:
-        return repr(self)
-    
-    def __repr__(self) -> str:
-        """-> FaviconActor[pid=1234]"""
-        label = 'pid' if self.mode == 'process' else 'tid'
-        # return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
-        return f'[underline]Worker[/underline]\\[{label}={self.pid}]'
-    
-    @staticmethod
-    def _state_to_str(state: ObjectState) -> str:
-        """Convert a statemachine.State, models.TextChoices.choices value, or Enum value to a str"""
-        return str(state.value) if isinstance(state, State) else str(state)
-    
-    @staticmethod
-    def _sql_for_select_top_n_candidates(qs: QuerySet, claim_from_top_n: int=CLAIM_FROM_TOP_N) -> tuple[str, tuple[Any, ...]]:
-        """Get the SQL for selecting the top N candidates from the queue (to claim one from)"""
-        queryset = qs.only('id')[:claim_from_top_n]
-        select_sql, select_params = compile_sql_select(queryset)
-        return select_sql, select_params
-    
-    @staticmethod
-    def _sql_for_update_claimed_obj(qs: QuerySet, update_kwargs: dict[str, Any]) -> tuple[str, tuple[Any, ...]]:
-        """Get the SQL for updating a claimed object to mark it as ACTIVE"""
-        # qs.update(status='started', retry_at=<now + MAX_TICK_TIME>)
-        update_sql, update_params = compile_sql_update(qs, update_kwargs=update_kwargs)
-        # e.g. UPDATE core_archiveresult SET status='%s', retry_at='%s' WHERE status NOT IN ('succeeded', 'failed', 'sealed', 'started') AND retry_at <= '2024-11-04 10:14:33.240903'
-        return update_sql, update_params
-    
-    @classmethod
-    def _get_model_from_generic_typevar(cls) -> Type[ModelType]:
-        """Get the django Model from the Generic[ModelType] TypeVar arg (and check that it inherits from django.db.models.Model)"""
-        # cls.__orig_bases__ is non-standard and may be removed in the future! if this breaks,
-        # we can just require the inherited class to define the Model as a classvar manually, e.g.:
-        #     class SnapshotActor(ActorType[Snapshot]):
-        #         Model: ClassVar[Type[Snapshot]] = Snapshot
-        # https://stackoverflow.com/questions/57706180/generict-base-class-how-to-get-type-of-t-from-within-instance
-        Model = get_args(cls.__orig_bases__[0])[0]   # type: ignore
-        assert issubclass(Model, DjangoModel), f'{cls.__name__}.Model must be a valid django Model'
-        return cast(Type[ModelType], Model)
-    
-
-    @classmethod
-    def _get_state_machine_instance(cls, obj: ModelType) -> StateMachine:
-        """Get the StateMachine instance for the given django Model instance (and check that it is a valid instance of cls.StateMachineClass)"""
-        obj_statemachine = None
-        state_machine_attr = getattr(obj, 'state_machine_attr', 'sm')
-        try:
-            obj_statemachine = getattr(obj, state_machine_attr)
-        except Exception:
-            pass
-        
-        if not isinstance(obj_statemachine, cls.StateMachineClass):
-            raise Exception(f'{cls.__name__}: Failed to find a valid StateMachine instance at {type(obj).__name__}.{state_machine_attr}')
+# __package__ = 'archivebox.workers'
+
+# import time
+
+
+# from typing import ClassVar, Type, Iterable, TypedDict
+# from django.db.models import QuerySet
+# from django.db import transaction
+# from django.utils import timezone
+# from django.utils.functional import classproperty       # type: ignore
+
+# from .models import Event, Process, EventDict
+
+
+# class ActorType:
+#     # static class attributes
+#     name: ClassVar[str]
+#     event_prefix: ClassVar[str]
+#     poll_interval: ClassVar[int] = 1
+    
+#     @classproperty
+#     def event_queue(cls) -> QuerySet[Event]:
+#         return Event.objects.filter(type__startswith=cls.event_prefix)
+
+#     @classmethod
+#     def fork(cls, wait_for_first_event=False, exit_on_idle=True) -> Process:
+#         cmd = ['archivebox', 'actor', cls.name]
+#         if exit_on_idle:
+#             cmd.append('--exit-on-idle')
+#         if wait_for_first_event:
+#             cmd.append('--wait-for-first-event')
+#         return Process.create_and_fork(cmd=cmd, actor_type=cls.name)
+
+#     @classproperty
+#     def processes(cls) -> QuerySet[Process]:
+#         return Process.objects.filter(actor_type=cls.name)
+
+#     @classmethod
+#     def run(cls, wait_for_first_event=False, exit_on_idle=True):
+
+#         if wait_for_first_event:
+#             event = cls.event_queue.get_next_unclaimed()
+#             while not event:
+#                 time.sleep(cls.poll_interval)
+#                 event = cls.event_queue.get_next_unclaimed()
+
+#         while True:
+#             output_events = list(cls.process_next_event()) or list(cls.process_idle_tick())   # process next event, or tick if idle
+#             yield from output_events
+#             if not output_events:
+#                 if exit_on_idle:
+#                     break
+#                 else:
+#                     time.sleep(cls.poll_interval)
+
+#     @classmethod
+#     def process_next_event(cls) -> Iterable[EventDict]:
+#         event = cls.event_queue.get_next_unclaimed()
+#         output_events = []
+        
+#         if not event:
+#             return []
+        
+#         cls.mark_event_claimed(event, duration=60)
+#         try:
+#             for output_event in cls.receive(event):
+#                 output_events.append(output_event)
+#                 yield output_event
+#             cls.mark_event_succeeded(event, output_events=output_events)
+#         except BaseException as e:
+#             cls.mark_event_failed(event, output_events=output_events, error=e)
+
+#     @classmethod
+#     def process_idle_tick(cls) -> Iterable[EventDict]:
+#         # reset the idle event to be claimed by the current process
+#         event, _created = Event.objects.update_or_create(
+#             name=f'{cls.event_prefix}IDLE',
+#             emitted_by=Process.current(),
+#             defaults={
+#                 'deliver_at': timezone.now(),
+#                 'claimed_proc': None,
+#                 'claimed_at': None,
+#                 'finished_at': None,
+#                 'error': None,
+#                 'parent': None,
+#             },
+#         )
+        
+#         # then process it like any other event
+#         yield from cls.process_next_event()
+
+#     @classmethod
+#     def receive(cls, event: Event) -> Iterable[EventDict]:
+#         handler_method = getattr(cls, f'on_{event.name}', None)
+#         if handler_method:
+#             yield from handler_method(event)
+#         else:
+#             raise Exception(f'No handler method for event: {event.name}')
+
+#     @staticmethod
+#     def on_IDLE() -> Iterable[EventDict]:
+#         return []
+    
+#     @staticmethod
+#     def mark_event_claimed(event: Event, duration: int=60):
+#         proc = Process.current()
+        
+#         with transaction.atomic():
+#             claimed = Event.objects.filter(id=event.id, claimed_proc=None, claimed_at=None).update(claimed_proc=proc, claimed_at=timezone.now())
+#             if not claimed:
+#                 event.refresh_from_db()
+#                 raise Exception(f'Event already claimed by another process: {event.claimed_proc}')
             
-        return obj_statemachine
-    
-    @classmethod
-    def _populate_missing_classvars_from_model(cls, Model: Type[ModelType]):
-        """Check that the class variables are set correctly based on the ModelType"""
-        
-        # check that Model is the same as the Generic[ModelType] parameter in the class definition
-        cls.Model = getattr(cls, 'Model', None) or Model
-        if cls.Model != Model:
-            raise ValueError(f'{cls.__name__}.Model must be set to the same Model as the Generic[ModelType] parameter in the class definition')
-        
-        # check that Model has a valid StateMachine with the required event defined on it
-        cls.StateMachineClass = getattr(cls, 'StateMachineClass', None)      # type: ignore
-        assert isinstance(cls.EVENT_NAME, str), f'{cls.__name__}.EVENT_NAME must be a str, got: {type(cls.EVENT_NAME).__name__} instead'
-        assert hasattr(cls.StateMachineClass, cls.EVENT_NAME), f'StateMachine {cls.StateMachineClass.__name__} must define a {cls.EVENT_NAME} event ({cls.__name__}.EVENT_NAME = {cls.EVENT_NAME})'
-        
-        # check that Model uses .id as its primary key field
-        primary_key_field = cls.Model._meta.pk.name
-        if primary_key_field != 'id':
-            raise NotImplementedError(f'Actors currently only support models that use .id as their primary key field ({cls.__name__} uses {cls.__name__}.{primary_key_field} as primary key)')
-        
-        # check that ACTIVE_STATE is defined and that it exists on the StateMachineClass
-        if not getattr(cls, 'ACTIVE_STATE', None):
-            raise NotImplementedError(f'{cls.__name__} must define an ACTIVE_STATE: ClassVar[State] (e.g. SnapshotMachine.started) ({cls.Model.__name__}.{cls.Model.state_field_name} gets set to this value to mark objects as actively processing)')
-        assert isinstance(cls.ACTIVE_STATE, (State, str)) and hasattr(cls.StateMachineClass, cls._state_to_str(cls.ACTIVE_STATE)), f'{cls.__name__}.ACTIVE_STATE must be a statemachine.State | str that exists on {cls.StateMachineClass.__name__}, got: {type(cls.ACTIVE_STATE).__name__} instead'
-        
-        # check the other ClassVar attributes for valid values
-        assert cls.CLAIM_ORDER and isinstance(cls.CLAIM_ORDER, tuple) and all(isinstance(order, str) for order in cls.CLAIM_ORDER), f'{cls.__name__}.CLAIM_ORDER must be a non-empty tuple[str, ...], got: {type(cls.CLAIM_ORDER).__name__} instead'
-        assert cls.CLAIM_FROM_TOP_N > 0, f'{cls.__name__}.CLAIM_FROM_TOP_N must be a positive int, got: {cls.CLAIM_FROM_TOP_N} instead'
-        assert cls.MAX_TICK_TIME >= 1, f'{cls.__name__}.MAX_TICK_TIME must be a positive int > 1, got: {cls.MAX_TICK_TIME} instead'
-        assert cls.MAX_CONCURRENT_ACTORS >= 1, f'{cls.__name__}.MAX_CONCURRENT_ACTORS must be a positive int >=1, got: {cls.MAX_CONCURRENT_ACTORS} instead'
-        assert isinstance(cls.CLAIM_ATOMIC, bool), f'{cls.__name__}.CLAIM_ATOMIC must be a bool, got: {cls.CLAIM_ATOMIC} instead'
-
-    # @classmethod
-    # def _fork_actor_as_thread(cls, **launch_kwargs: LaunchKwargs) -> int:
-    #     """Spawn a new background thread running the actor's runloop"""
-    #     actor = cls(mode='thread', **launch_kwargs)
-    #     bg_actor_thread = Thread(target=actor.runloop)
-    #     bg_actor_thread.start()
-    #     assert bg_actor_thread.native_id is not None
-    #     return bg_actor_thread.native_id
-    
-    @classmethod
-    def _fork_actor_as_process(cls, **launch_kwargs: LaunchKwargs) -> int:
-        """Spawn a new background process running the actor's runloop"""
-        actor = cls(mode='process', **launch_kwargs)
-        bg_actor_process = Process(target=actor.runloop)
-        bg_actor_process.start()
-        assert bg_actor_process.pid is not None
-        cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
-        return bg_actor_process.pid
-    
-    @classmethod
-    def _obj_repr(cls, obj: ModelType | Any) -> str:
-        """Get a string representation of the given django Model instance"""
-        return f'[grey53]{type(obj).__name__}\\[{obj.ABID}][/grey53]'
-    
-    ### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
-    
-    @classmethod
-    def get_running_actors(cls) -> list[int]:
-        """returns a list of pids of all running actors of this type"""
-        # WARNING: only works for process actors, not thread actors
-        if cls.mode == 'thread':
-            raise NotImplementedError('get_running_actors() is not implemented for thread actors')
-        return [
-            proc.pid for proc in cls._SPAWNED_ACTOR_PIDS
-            if proc.is_running() and proc.status() != 'zombie'
-        ]
-        
-    @classmethod
-    def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
-        """Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
-        queue_length = queue.count()
-        if not queue_length:                                      # queue is empty, spawn 0 actors
-            return []
-        
-        # WARNING:
-        # spawning new actors processes is slow/expensive, avoid spawning many actors at once in a single orchestrator tick.
-        # limit to spawning 1 or 2 at a time per orchestrator tick, and let the next tick handle starting another couple.
-        # DONT DO THIS:
-        # if queue_length > 20:                      # queue is extremely long, spawn maximum actors at once!
-        #   num_to_spawn_this_tick = cls.MAX_CONCURRENT_ACTORS
-        
-        if queue_length > 10:    
-            num_to_spawn_this_tick = 2  # spawn more actors per tick if queue is long
-        else:
-            num_to_spawn_this_tick = 1  # spawn fewer actors per tick if queue is short
-        
-        num_remaining = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
-        num_to_spawn_now: int = limit(num_to_spawn_this_tick, num_remaining)
-        
-        actors_launch_kwargs: list[LaunchKwargs] = num_to_spawn_now * [{**cls.launch_kwargs}]
-        return actors_launch_kwargs
-        
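-    # e.g. with MAX_CONCURRENT_ACTORS=4, a queue of 50 objects, and 3 actors already running (illustrative
-    # numbers): num_to_spawn_this_tick=2 is capped by num_remaining=1, so a single LaunchKwargs dict is returned.
-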
-    @classmethod
-    def start(cls, mode: Literal['thread', 'process']='process', **launch_kwargs: LaunchKwargs) -> int:
-        if mode == 'thread':
-            raise NotImplementedError('Thread-based actors are disabled to reduce codebase complexity. Please use processes for everything')
-            # return cls._fork_actor_as_thread(**launch_kwargs)
-        elif mode == 'process':
-            return cls._fork_actor_as_process(**launch_kwargs)
-        raise ValueError(f'Invalid actor mode: {mode} must be "thread" or "process"')
-    
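-    # Putting the spawning API together, one orchestrator tick looks roughly like this
-    # (a minimal sketch; `MyActor` stands in for any concrete Actor subclass):
-    #     queue = MyActor.get_queue()
-    #     running = MyActor.get_running_actors()
-    #     for launch_kwargs in MyActor.get_actors_to_spawn(queue, running):
-    #         MyActor.start(mode='process', **launch_kwargs)
-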
-    @classproperty
-    def qs(cls) -> QuerySet[ModelType]:
-        """
-        Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about.
-        Override this in the subclass to define the QuerySet of objects that the Actor is going to poll for new work.
-        (don't limit, order, or filter this by retry_at or status yet, Actor.get_queue() handles that part)
-        """
-        return cls.Model.objects.filter()
-    
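-    # e.g. a subclass could narrow the pool of candidate objects like this (an illustrative
-    # override; the `is_archivable` field is hypothetical):
-    #     @classproperty
-    #     def qs(cls) -> QuerySet[ModelType]:
-    #         return cls.Model.objects.filter(is_archivable=True)
-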
-    @classproperty
-    def final_q(cls) -> Q:
-        """Get the filter for objects that are already completed / in a final state"""
-        return Q(**{
-            f'{cls.Model.state_field_name}__in': [cls._state_to_str(s) for s in cls.StateMachineClass.final_states],
-        })  # status__in=('sealed', 'failed', 'succeeded')
-    
-    @classproperty
-    def active_q(cls) -> Q:
-        """Get the filter for objects that are marked active (and are still running / not timed out)"""
-        return Q(retry_at__gte=timezone.now(), **{cls.Model.state_field_name: cls._state_to_str(cls.ACTIVE_STATE)})   # e.g. Q(status='started')
-    
-    @classproperty
-    def stalled_q(cls) -> Q:
-        """Get the filter for objects that are marked active but are timed out"""
-        return Q(retry_at__lte=timezone.now(), **{cls.Model.state_field_name: cls._state_to_str(cls.ACTIVE_STATE)})                     # e.g. Q(status='started') AND Q(<retry_at is in the past>)
-    
-    @classproperty
-    def future_q(cls) -> Q:
-        """Get the filter for objects that have a retry_at in the future"""
-        return Q(retry_at__gt=timezone.now(), **{cls.Model.state_field_name: 'QUEUED'})
-    
-    @classproperty
-    def pending_q(cls) -> Q:
-        """Get the filter for objects that are ready for processing."""
-        return ~Q(**{
-            f'{cls.Model.state_field_name}__in': (*[cls._state_to_str(s) for s in cls.StateMachineClass.final_states], cls._state_to_str(cls.ACTIVE_STATE))
-        })  # status__not_in=('sealed', 'failed', 'succeeded', 'started')
-    
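-    # Taken together, the Q filters above partition objects by status + retry_at, e.g. with the
-    # example states from the comments above:
-    #     final_q:    status in ('sealed', 'failed', 'succeeded')   (done, never re-claimed)
-    #     active_q:   status == 'started' and retry_at >= now       (claimed, still running)
-    #     stalled_q:  status == 'started' and retry_at <= now       (claimed but timed out)
-    #     pending_q:  any other status                              (ready to be claimed)
-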
-    @classmethod
-    def get_queue(cls, sort: bool=True) -> QuerySet[ModelType]:
-        """
-        Get the sorted and filtered QuerySet of objects that are ready for processing.
-        e.g. qs.exclude(status__in=('sealed', 'started'), retry_at__gt=timezone.now()).order_by('retry_at')
-        """
-        unsorted_qs = cls.qs.filter(cls.pending_q) | cls.qs.filter(cls.stalled_q)
-        return unsorted_qs.order_by(*cls.CLAIM_ORDER) if sort else unsorted_qs
-
-    ### Instance Methods: Only called from within Actor instance after it has been spawned (i.e. forked as a thread or process)
-    
-    def runloop(self):
-        """The main runloop that starts running when the actor is spawned (as subprocess or thread) and exits when the queue is empty"""
-        self.on_startup()
-        obj_to_process: ModelType | None = None
-        last_error: BaseException | None = None
-        try:
-            while True:
-                # Get the next object to process from the queue
-                try:
-                    obj_to_process = cast(ModelType, self.get_next(atomic=self.CLAIM_ATOMIC))
-                except (ActorQueueIsEmpty, ActorObjectAlreadyClaimed) as err:
-                    last_error = err
-                    obj_to_process = None
-                
-                # Handle the case where there is no next object to process
-                if obj_to_process:
-                    self.idle_count = 0   # reset idle count if we got an object
-                else:
-                    if self.idle_count >= 3:
-                        break             # stop looping and exit if queue is empty and we have idled for ~3 seconds
-                    else:
-                        # print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
-                        self.idle_count += 1
-                        time.sleep(1)
-                        continue
-                
-                # Process the object by triggering its StateMachine.tick() method
-                self.on_tick_start(obj_to_process)
-                try:
-                    self.tick(obj_to_process)
-                except Exception as err:
-                    last_error = err
-                    print(f'[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.tick()[/red] ERROR: [red]{type(err).__name__}: {err}[/red]')
-                    db.connections.close_all()                         # always reset the db connection after an exception to clear any pending transactions
-                    self.on_tick_exception(obj_to_process, err)
-                    traceback.print_exc()
-                finally:
-                    self.on_tick_end(obj_to_process)
-
-        except BaseException as err:
-            last_error = err
-            if isinstance(err, KeyboardInterrupt):
-                print()
-            else:
-                print(f'\n[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
-                print(f'    Last processed object: {obj_to_process}')
-                raise
-        finally:
-            self.on_shutdown(last_obj=obj_to_process, last_error=last_error)
-    
-    @classmethod
-    def get_update_kwargs_to_claim_obj(cls) -> dict[str, Any]:
-        """
-        Get the field values needed to mark a pending obj_to_process as being actively processed (aka claimed)
-        by the current Actor. Returned kwargs will be applied using: qs.filter(id=obj_to_process.id).update(**kwargs).
-        F() expressions are allowed in field values if you need to update a field based on its current value.
-        Can be defined as a normal method (instead of a classmethod) on subclasses if it needs to access instance vars.
-        """
-        return {
-            # cls.Model.state_field_name: cls._state_to_str(cls.ACTIVE_STATE),   # do this manually in the state machine enter hooks
-            'retry_at': timezone.now() + timedelta(seconds=cls.MAX_TICK_TIME),
-        }
-    
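-    # e.g. an override could also bump a retry counter atomically using an F() expression
-    # (an illustrative sketch; `num_claims` is a hypothetical field, F is django.db.models.F):
-    #     def get_update_kwargs_to_claim_obj(self):
-    #         return {
-    #             'retry_at': timezone.now() + timedelta(seconds=self.MAX_TICK_TIME),
-    #             'num_claims': F('num_claims') + 1,
-    #         }
-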
-    def get_next(self, atomic: bool | None=None) -> ModelType | None:
-        """get the next object from the queue, atomically locking it if self.CLAIM_ATOMIC=True"""
-        atomic = self.CLAIM_ATOMIC if atomic is None else atomic
-        if atomic:
-            # fetch and claim the next object from the queue in one go atomically
-            obj = self.get_next_atomic()
-        else:
-            # two-step claim: fetch the next object and lock it in a separate query
-            obj = self.get_next_non_atomic()
-        return obj
-    
-    def get_next_non_atomic(self) -> ModelType:
-        """
-        Naively selects the top/first object from self.get_queue().order_by(*self.CLAIM_ORDER),
-        then claims it by running .update(status='started', retry_at=<now + MAX_TICK_TIME>).
-        
-        Do not use this method if there is more than one Actor racing to get objects from the same queue;
-        it will be slow/buggy as they'll compete to lock the same object at the same time (TOCTTOU race).
-        """
-        obj = self.get_queue().first()
-        if obj is None:
-            raise ActorQueueIsEmpty(f'No next object available in {self}.get_queue()')
-        
-        locked = self.get_queue().filter(id=obj.id).update(**self.get_update_kwargs_to_claim_obj())
-        if not locked:
-            raise ActorObjectAlreadyClaimed(f'Unable to lock the next {self.Model.__name__} object from {self}.get_queue().first()')
-        return obj
-        
-    def get_next_atomic(self) -> ModelType | None:
-        """
-        Selects the top n=50 objects from the queue and atomically claims a random one from that set.
-        This approach safely minimizes contention with other Actors trying to select from the same Queue.
-
-        The atomic query is roughly equivalent to the following:  (all done in one SQL query to avoid a TOCTTOU race)
-            top_candidates are selected from:   qs.order_by(*CLAIM_ORDER).only('id')[:CLAIM_FROM_TOP_N]
-            a single candidate is chosen using: qs.filter(id__in=top_n_candidates).order_by('?').first()
-            the chosen obj is claimed using:    qs.filter(id=chosen_obj).update(status=ACTIVE_STATE, retry_at=<now + MAX_TICK_TIME>)
-        """
-        # TODO: if we switch from SQLite to PostgreSQL in the future, we should change this
-        # to use SELECT FOR UPDATE instead of a subquery + ORDER BY RANDOM() LIMIT 1
-        
-        # e.g. SELECT id FROM core_archiveresult WHERE status NOT IN (...) AND retry_at <= '...' ORDER BY retry_at ASC LIMIT 50
-        qs = self.get_queue()
-        select_top_candidates_sql, select_params = self._sql_for_select_top_n_candidates(qs=qs)
-        assert select_top_candidates_sql.startswith('SELECT ')
-        
-        # e.g. UPDATE core_archiveresult SET status='%s', retry_at='%s' WHERE status NOT IN (...) AND retry_at <= '...'
-        update_claimed_obj_sql, update_params = self._sql_for_update_claimed_obj(qs=self.qs.all(), update_kwargs=self.get_update_kwargs_to_claim_obj())
-        assert update_claimed_obj_sql.startswith('UPDATE ') and 'WHERE' not in update_claimed_obj_sql
-        db_table = self.Model._meta.db_table  # e.g. core_archiveresult
-        
-        # subquery gets the pool of the top candidates e.g. self.get_queue().only('id')[:CLAIM_FROM_TOP_N]
-        # main query selects a random one from that pool, and claims it using .update(status=ACTIVE_STATE, retry_at=<now + MAX_TICK_TIME>)
-        # this is all done in one atomic SQL query to avoid TOCTTOU race conditions (as much as possible)
-        atomic_select_and_update_sql = f"""
-            WITH top_candidates AS ({select_top_candidates_sql})
-            {update_claimed_obj_sql}
-            WHERE "{db_table}"."id" IN (
-                SELECT id FROM top_candidates
-                ORDER BY RANDOM()
-                LIMIT 1
-            )
-            RETURNING *;
-        """
-        
-        try:
-            updated = qs.raw(atomic_select_and_update_sql, (*select_params, *update_params))
-            assert len(updated) <= 1, f'Expected to claim at most 1 object, but Django modified {len(updated)} objects!'
-            return updated[0]
-        except IndexError:
-            if self.get_queue().exists():
-                raise ActorObjectAlreadyClaimed(f'Unable to lock the next {self.Model.__name__} object from {self}.get_queue().first()')
-            else:
-                raise ActorQueueIsEmpty(f'No next object available in {self}.get_queue()')
-
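-    # For reference, the assembled query for e.g. core_archiveresult comes out shaped roughly like
-    # this (params elided, they are always passed separately to the driver):
-    #     WITH top_candidates AS (SELECT id FROM core_archiveresult WHERE ... ORDER BY retry_at ASC LIMIT 50)
-    #     UPDATE core_archiveresult SET retry_at = %s
-    #     WHERE "core_archiveresult"."id" IN (SELECT id FROM top_candidates ORDER BY RANDOM() LIMIT 1)
-    #     RETURNING *;
-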
-    def tick(self, obj_to_process: ModelType) -> None:
-        """Call the object.sm.tick() method to process the object"""
-        print(f'\n[grey53]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.tick()[/grey53] [blue]{obj_to_process.status.upper()}[/blue] ➡️ ...  +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
-        
-        # get the StateMachine instance from the object
-        obj_statemachine = self._get_state_machine_instance(obj_to_process)
-        starting_state = obj_statemachine.current_state
-        
-        # trigger the event on the StateMachine instance
-        obj_tick_method = getattr(obj_statemachine, self.EVENT_NAME)  # e.g. obj_statemachine.tick()
-        obj_tick_method()
-        
-        ending_state = obj_statemachine.current_state
-        if starting_state != ending_state:
-            self.on_state_change(obj_to_process, starting_state, ending_state)
-        
-        # save the object to persist any state changes
-        obj_to_process.save()
-        
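-    # e.g. with EVENT_NAME = 'tick', the getattr() call above resolves to obj_statemachine.tick();
-    # a subclass could set EVENT_NAME = 'process' to drive obj_statemachine.process() instead (illustrative).
-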
-    def on_startup(self) -> None:
-        if self.mode == 'thread':
-            # self.pid = get_native_id()  # thread id
-            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (THREAD)[/green]')
-            raise NotImplementedError('Thread-based actors are disabled to reduce codebase complexity. Please use processes for everything')
-        else:
-            self.pid = os.getpid()      # process id
-            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (PROCESS)[/green]')
-        # abx.pm.hook.on_actor_startup(actor=self)
-        
-    def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None:
-        # if isinstance(last_error, KeyboardInterrupt) or last_error is None:
-        #     last_error_str = '[green](CTRL-C)[/green]'
-        # elif isinstance(last_error, ActorQueueIsEmpty):
-        #     last_error_str = '[green](queue empty)[/green]'
-        # elif isinstance(last_error, ActorObjectAlreadyClaimed):
-        #     last_error_str = '[green](queue race)[/green]'
-        # else:
-        #     last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
-
-        # print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
-        # abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error)
-        pass
-        
-    def on_tick_start(self, obj_to_process: ModelType) -> None:
-        # print(f'🏃‍♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
-        # abx.pm.hook.on_actor_tick_start(actor=self, obj_to_process=obj)
-        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
-        pass
-    
-    def on_tick_end(self, obj_to_process: ModelType) -> None:
-        # print(f'🏃‍♂️ {self}.on_tick_end() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
-        # abx.pm.hook.on_actor_tick_end(actor=self, obj_to_process=obj_to_process)
-        # self.timer.end()
-        pass
-    
-    def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None:
-        print(f'[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.on_tick_exception()[/red] [blue]{obj_to_process.status}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s: [red]{type(error).__name__}: {error}[/red]')
-        # abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error)
-
-    def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None:
-        print(f'[blue]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.on_state_change() {starting_state} ➡️ {ending_state}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
-        # abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state)
-
-
-def compile_sql_select(queryset: QuerySet, filter_kwargs: dict[str, Any] | None=None, order_args: tuple[str, ...]=(), limit: int | None=None) -> tuple[str, tuple[Any, ...]]:
-    """
-    Compute the SELECT query SQL for a queryset.filter(**filter_kwargs).order_by(*order_args)[:limit] call
-    Returns a tuple of (sql, params) where sql is a template string containing %s (unquoted) placeholders for the params
-    
-    WARNING:
-    final_sql = sql % params  DOES NOT WORK to assemble the final SQL string because the %s placeholders are not quoted/escaped
-    they should always be passed separately to the DB driver so it can do its own quoting/escaping to avoid SQL injection and syntax errors
-    """
-    assert isinstance(queryset, QuerySet), f'compile_sql_select(...) first argument must be a QuerySet, got: {type(queryset).__name__} instead'
-    assert filter_kwargs is None or isinstance(filter_kwargs, dict), f'compile_sql_select(...) filter_kwargs argument must be a dict[str, Any], got: {type(filter_kwargs).__name__} instead'
-    assert isinstance(order_args, tuple) and all(isinstance(arg, str) for arg in order_args), f'compile_sql_select(...) order_args argument must be a tuple[str, ...] got: {type(order_args).__name__} instead'
-    assert limit is None or isinstance(limit, int), f'compile_sql_select(...) limit argument must be an int, got: {type(limit).__name__} instead'
-    
-    queryset = queryset._chain()                      # type: ignore   # copy queryset to avoid modifying the original
-    if filter_kwargs:
-        queryset = queryset.filter(**filter_kwargs)
-    if order_args:
-        queryset = queryset.order_by(*order_args)
-    if limit is not None:
-        queryset = queryset[:limit]
-    query = queryset.query
-    
-    # e.g. SELECT id FROM core_archiveresult WHERE status NOT IN (%s, %s, %s) AND retry_at <= %s ORDER BY retry_at ASC LIMIT 50
-    select_sql, select_params = query.get_compiler(queryset.db).as_sql()
-    return select_sql, select_params
-
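-# Usage sketch (illustrative; assumes the Snapshot model and django.db.connection):
-#     sql, params = compile_sql_select(Snapshot.objects.all(), filter_kwargs={'status': 'queued'},
-#                                      order_args=('retry_at',), limit=50)
-#     with connection.cursor() as cursor:
-#         cursor.execute(sql, params)    # params are passed separately, never %-interpolated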
-
-def compile_sql_update(queryset: QuerySet, update_kwargs: dict[str, Any]) -> tuple[str, tuple[Any, ...]]:
-    """
-    Compute the UPDATE query SQL for a queryset.filter(**filter_kwargs).update(**update_kwargs) call
-    Returns a tuple of (sql, params) where sql is a template string containing %s (unquoted) placeholders for the params
-    
-    Based on the django.db.models.QuerySet.update() source code, but modified to return the SQL instead of executing the update
-    https://github.com/django/django/blob/611bf6c2e2a1b4ab93273980c45150c099ab146d/django/db/models/query.py#L1217
-    
-    WARNING:
-    final_sql = sql % params  DOES NOT WORK to assemble the final SQL string because the %s placeholders are not quoted/escaped
-    they should always be passed separately to the DB driver so it can do its own quoting/escaping to avoid SQL injection and syntax errors
-    """
-    assert isinstance(queryset, QuerySet), f'compile_sql_update(...) first argument must be a QuerySet, got: {type(queryset).__name__} instead'
-    assert isinstance(update_kwargs, dict), f'compile_sql_update(...) update_kwargs argument must be a dict[str, Any], got: {type(update_kwargs).__name__} instead'
-    
-    queryset = queryset._chain().all()                # type: ignore   # copy queryset to avoid modifying the original and clear any filters
-    queryset.query.clear_ordering(force=True)                          # clear any ORDER BY clauses
-    queryset.query.clear_limits()                                      # clear any LIMIT clauses aka slices[:n]
-    queryset._for_write = True                        # type: ignore
-    query = queryset.query.chain(sql.UpdateQuery)     # type: ignore
-    query.add_update_values(update_kwargs)            # type: ignore
-    query.annotations = {}                                             # clear any annotations
-    
-    # e.g. UPDATE core_archiveresult SET status='%s', retry_at='%s' WHERE status NOT IN (%s, %s, %s) AND retry_at <= %s
-    update_sql, update_params = query.get_compiler(queryset.db).as_sql()
-    
-    # make sure you only pass a raw queryset with no .filter(...) clauses applied to it, the return value is designed to be used
-    # in a manually assembled SQL query with its own WHERE clause later on
-    assert 'WHERE' not in update_sql, f'compile_sql_update(...) should only contain a SET statement but it tried to return a query with a WHERE clause: {update_sql}'
-    
-    # print(update_sql, update_params)
-
-    return update_sql, update_params
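-
-# Usage sketch (illustrative): the returned SET-only statement is meant to get a caller-supplied
-# WHERE clause appended before execution, e.g.:
-#     set_sql, set_params = compile_sql_update(Snapshot.objects.all(), update_kwargs={'status': 'started'})
-#     full_sql = f'{set_sql} WHERE "core_snapshot"."id" = %s'
-#     cursor.execute(full_sql, (*set_params, snapshot_id))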
+#             process_updated = Process.objects.filter(id=proc.id, active_event=None).update(active_event=event)
+#             if not process_updated:
+#                 raise Exception(f'Unable to update process.active_event: {proc}.active_event = {event}')
+
+#     @staticmethod
+#     def mark_event_succeeded(event: Event, output_events: Iterable[EventDict]):
+#         assert event.claimed_proc and (event.claimed_proc == Process.current())
+#         with transaction.atomic():
+#             updated = Event.objects.filter(id=event.id, claimed_proc=event.claimed_proc, claimed_at=event.claimed_at, finished_at=None).update(finished_at=timezone.now())
+#             if not updated:
+#                 event.refresh_from_db()
+#                 raise Exception(f'Event {event} failed to mark as succeeded, it was modified by another process: {event.claimed_proc}')
+
+#             process_updated = Process.objects.filter(id=event.claimed_proc.id, active_event=event).update(active_event=None)
+#             if not process_updated:
+#                 raise Exception(f'Unable to unset process.active_event: {event.claimed_proc}.active_event = {event}')
+
+#         # dispatch any output events
+#         for output_event in output_events:
+#             Event.dispatch(event=output_event, parent=event)
+
+#         # trigger any callback events
+#         if event.on_success:
+#             Event.dispatch(event=event.on_success, parent=event)
+
+#     @staticmethod
+#     def mark_event_failed(event: Event, output_events: Iterable[EventDict]=(), error: BaseException | None = None):
+#         assert event.claimed_proc and (event.claimed_proc == Process.current())
+#         with transaction.atomic():
+#             updated = Event.objects.filter(id=event.id, claimed_proc=event.claimed_proc, claimed_at=event.claimed_at, finished_at=None).update(finished_at=timezone.now(), error=str(error))
+#             if not updated:
+#                 event.refresh_from_db()
+#                 raise Exception(f'Event {event} failed to mark as failed, it was modified by another process: {event.claimed_proc}')
+
+#             process_updated = Process.objects.filter(id=event.claimed_proc.id, active_event=event).update(active_event=None)
+#             if not process_updated:
+#                 raise Exception(f'Unable to unset process.active_event: {event.claimed_proc}.active_event = {event}')
+
+        
+#         # add dedicated error event to the output events
+#         output_events = [
+#             *output_events,
+#             {'name': f'{event.name}_ERROR', 'error': f'{type(error).__name__}: {error}'},
+#         ]
+        
+#         # dispatch any output events
+#         for output_event in output_events:
+#             Event.dispatch(event=output_event, parent=event)
+        
+#         # trigger any callback events
+#         if event.on_failure:
+#             Event.dispatch(event=event.on_failure, parent=event)
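+
+#     # Illustrative call pattern for the two methods above (a sketch; the enclosing class is not
+#     # shown in this hunk, and `Worker` / handle() are hypothetical stand-ins):
+#     #     try:
+#     #         output_events = handle(event)
+#     #         Worker.mark_event_succeeded(event, output_events=output_events)
+#     #     except Exception as err:
+#     #         Worker.mark_event_failed(event, error=err)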