1 year ago · 2a1afcf6c2
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -41,6 +41,7 @@ from workers.tasks import bg_archive_snapshot
 
				 from tags.models import KVTag
			
 
				 # from machine.models import Machine, NetworkInterface
			
 
				 
			
 
				+from crawls.models import Seed, Crawl, CrawlSchedule
			
 
				 
			
 
				 
			
 
				 class Tag(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ABIDModel):
			
@@ -133,403 +134,6 @@ class SnapshotTag(models.Model):
 
				         unique_together = [('snapshot', 'tag')]
			
 
				 
			
 
				 
			
 
				-class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
			
 
				-    """
			
 
				-    A fountain that produces URLs (+metadata) each time it's queried e.g.
			
 
				-        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
			
 
				-        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
			
 
				-        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
			
 
				-        - https://getpocket.com/user/nikisweeting/feed
			
 
				-        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
			
 
				-        - ...
			
 
				-    Each query of a Seed can produce the same list of URLs, or a different list each time.
			
 
				-    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
			
 
				-        
			
 
				-    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
			
 
				-    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
			
 
				-    The outlinks then get turned into new pending Snapshots under the same crawl,
			
 
				-    and the cycle repeats until Crawl.max_depth.
			
 
				-
			
 
				-    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
			
 
				-    stateful remote services, files with contents that change, directories that have new files within, etc.
			
 
				-    """
			
 
				-    
			
 
				-    ### ModelWithReadOnlyFields:
			
 
				-    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'uri')
			
 
				-    
			
 
				-    ### Immutable fields
			
 
				-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				-    abid = ABIDField(prefix=abid_prefix)
			
 
				-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)                  # unique source location where URLs will be loaded from
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
			
 
				-    
			
 
				-    ### Mutable fields:
			
 
				-    extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
			
 
				-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
			
 
				-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
			
 
				-    modified_at = models.DateTimeField(auto_now=True)
			
 
				-
			
 
				-    ### ModelWithConfig:
			
 
				-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
			
 
				-
			
 
				-    ### ModelWithOutputDir:
			
 
				-    output_dir = models.CharField(max_length=255, null=False, blank=True, default='', help_text='The directory to store the output of this seed')
			
 
				-
			
 
				-    ### ModelWithNotes:
			
 
				-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
			
 
				-
			
 
				-    ### ModelWithKVTags:
			
 
				-    tag_set = GenericRelation(
			
 
				-        KVTag,
			
 
				-        related_query_name="seed",
			
 
				-        content_type_field="obj_type",
			
 
				-        object_id_field="obj_id",
			
 
				-        order_by=('name',),
			
 
				-    )
			
 
				-    
			
 
				-    ### ABIDModel:
			
 
				-    abid_prefix = 'src_'
			
 
				-    abid_ts_src = 'self.created_at'
			
 
				-    abid_uri_src = 'self.uri'
			
 
				-    abid_subtype_src = 'self.extractor'
			
 
				-    abid_rand_src = 'self.id'
			
 
				-    abid_drift_allowed = True
			
 
				-    
			
 
				-    ### Managers:
			
 
				-    crawl_set: models.Manager['Crawl']
			
 
				-
			
 
				-    class Meta:
			
 
				-        verbose_name = 'Seed'
			
 
				-        verbose_name_plural = 'Seeds'
			
 
				-        
			
 
				-        unique_together = (('created_by', 'uri', 'extractor'),('created_by', 'label'))
			
 
				-
			
 
				-
			
 
				-    @classmethod
			
 
				-    def from_file(cls, source_file: Path, label: str='', parser: str='auto', tag: str='', created_by: int|None=None, config: dict|None=None):
			
 
				-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
			
 
				-        
			
 
				-        seed, _ = cls.objects.get_or_create(
			
 
				-            label=label or source_file.name,
			
 
				-            uri=f'file://{source_path}',
			
 
				-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
			
 
				-            extractor=parser,
			
 
				-            tags_str=tag,
			
 
				-            config=config or {},
			
 
				-        )
			
 
				-        seed.save()
			
 
				-        return seed
			
 
				-
			
 
				-    @property
			
 
				-    def source_type(self):
			
 
				-        # e.g. http/https://
			
 
				-        #      file://
			
 
				-        #      pocketapi://
			
 
				-        #      s3://
			
 
				-        #      etc..
			
 
				-        return self.uri.split('://', 1)[0].lower()
			
 
				-
			
 
				-    @property
			
 
				-    def api_url(self) -> str:
			
 
				-        # /api/v1/core/seed/{uulid}
			
 
				-        return reverse_lazy('api-1:get_seed', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
			
 
				-
			
 
				-    @property
			
 
				-    def api_docs_url(self) -> str:
			
 
				-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_seed'
			
 
				-
			
 
				-    @property
			
 
				-    def scheduled_crawl_set(self) -> QuerySet['CrawlSchedule']:
			
 
				-        from crawls.models import CrawlSchedule
			
 
				-        return CrawlSchedule.objects.filter(template__seed_id=self.pk)
			
 
				-
			
 
				-    @property
			
 
				-    def snapshot_set(self) -> QuerySet['Snapshot']:
			
 
				-        from core.models import Snapshot
			
 
				-        
			
 
				-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
			
 
				-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithNotes, ModelWithHealthStats):
			
 
				-    """
			
 
				-    A record for a job that should run repeatedly on a given schedule.
			
 
				-    
			
 
				-    It pulls from a given Seed and creates a new Crawl for each scheduled run.
			
 
				-    The new Crawl will inherit all the properties of the crawl_template Crawl.
			
 
				-    """
			
 
				-    ### ModelWithReadOnlyFields:
			
 
				-    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id')
			
 
				-    
			
 
				-    ### Immutable fields:
			
 
				-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				-    abid = ABIDField(prefix=abid_prefix)
			
 
				-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
			
 
				-    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
			
 
				-    
			
 
				-    ### Mutable fields
			
 
				-    schedule = models.CharField(max_length=64, blank=False, null=False, help_text='The schedule to run this crawl on in CRON syntax e.g. 0 0 * * * (see https://crontab.guru/)')
			
 
				-    is_enabled = models.BooleanField(default=True)
			
 
				-    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this scheduled crawl')
			
 
				-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
			
 
				-    modified_at = models.DateTimeField(auto_now=True)
			
 
				-    
			
 
				-    ### ModelWithKVTags:
			
 
				-    tag_set = GenericRelation(
			
 
				-        KVTag,
			
 
				-        related_query_name="crawlschedule",
			
 
				-        content_type_field="obj_type",
			
 
				-        object_id_field="obj_id",
			
 
				-        order_by=('name',),
			
 
				-    )
			
 
				-    
			
 
				-    ### ABIDModel:
			
 
				-    abid_prefix = 'cws_'
			
 
				-    abid_ts_src = 'self.created_at'
			
 
				-    abid_uri_src = 'self.template.seed.uri'
			
 
				-    abid_subtype_src = 'self.template.persona'
			
 
				-    abid_rand_src = 'self.id'
			
 
				-    abid_drift_allowed = True
			
 
				-    
			
 
				-    ### Managers:
			
 
				-    crawl_set: models.Manager['Crawl']
			
 
				-    
			
 
				-    class Meta(TypedModelMeta):
			
 
				-        verbose_name = 'Scheduled Crawl'
			
 
				-        verbose_name_plural = 'Scheduled Crawls'
			
 
				-        
			
 
				-    def __str__(self) -> str:
			
 
				-        uri = (self.template and self.template.seed and self.template.seed.uri) or '<no url set>'
			
 
				-        crawl_label = self.label or (self.template and self.template.seed and self.template.seed.label) or 'Untitled Crawl'
			
 
				-        if self.id and self.template:
			
 
				-            return f'[{self.ABID}] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
			
 
				-        return f'[{self.abid_prefix}****not*saved*yet****] {uri[:64]} @ {self.schedule} (Scheduled {crawl_label})'
			
 
				-    
			
 
				-    @property
			
 
				-    def api_url(self) -> str:
			
 
				-        # /api/v1/core/crawlschedule/{uulid}
			
 
				-        return reverse_lazy('api-1:get_any', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
			
 
				-
			
 
				-    @property
			
 
				-    def api_docs_url(self) -> str:
			
 
				-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_any'
			
 
				-    
			
 
				-    def save(self, *args, **kwargs):
			
 
				-        self.label = self.label or self.template.seed.label or self.template.seed.uri
			
 
				-        super().save(*args, **kwargs)
			
 
				-        
			
 
				-        # make sure the template crawl points to this schedule as its schedule
			
 
				-        self.template.schedule = self
			
 
				-        self.template.save()
			
 
				-        
			
 
				-    @property
			
 
				-    def snapshot_set(self) -> QuerySet['Snapshot']:
			
 
				-        from core.models import Snapshot
			
 
				-        
			
 
				-        crawl_ids = self.crawl_set.values_list('pk', flat=True)
			
 
				-        return Snapshot.objects.filter(crawl_id__in=crawl_ids)
			
 
				-    
			
 
				-
			
 
				-class CrawlManager(models.Manager):
			
 
				-    pass
			
 
				-
			
 
				-class CrawlQuerySet(models.QuerySet):
			
 
				-    """
			
 
				-    Enhanced QuerySet for Crawl that adds some useful methods.
			
 
				-    
			
 
				-    To get all the snapshots for a given set of Crawls:
			
 
				-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').snapshots() -> QuerySet[Snapshot]
			
 
				-    
			
 
				-    To get all the archiveresults for a given set of Crawls:
			
 
				-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').archiveresults() -> QuerySet[ArchiveResult]
			
 
				-    
			
 
				-    To export the list of Crawls as a CSV or JSON:
			
 
				-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_csv() -> str
			
 
				-        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_json() -> str
			
 
				-    """
			
 
				-    def snapshots(self, **filter_kwargs) -> QuerySet['Snapshot']:
			
 
				-        return Snapshot.objects.filter(crawl_id__in=self.values_list('pk', flat=True), **filter_kwargs)
			
 
				-    
			
 
				-    def archiveresults(self) -> QuerySet['ArchiveResult']:
			
 
				-        return ArchiveResult.objects.filter(snapshot__crawl_id__in=self.values_list('pk', flat=True))
			
 
				-    
			
 
				-    def as_csv_str(self, keys: Iterable[str]=()) -> str:
			
 
				-        return '\n'.join(
			
 
				-            row.as_csv(keys=keys)
			
 
				-            for row in self.all()
			
 
				-        )
			
 
				-    
			
 
				-    def as_jsonl_str(self, keys: Iterable[str]=()) -> str:
			
 
				-        return '\n'.join([
			
 
				-            row.as_jsonl_row(keys=keys)
			
 
				-            for row in self.all()
			
 
				-        ])
			
 
				-
			
 
				-
			
 
				-
			
 
				-class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
			
 
				-    """
			
 
				-    A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
			
 
				-
			
 
				-    A new Crawl should be created for each loading from a Seed (because it can produce a different set of URLs every time its loaded).
			
 
				-    E.g. every scheduled import from an RSS feed should create a new Crawl, and more loadings from the same seed each create a new Crawl
			
 
				-    
			
 
				-    Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a 
			
 
				-    file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
			
 
				-    """
			
 
				-    
			
 
				-    ### ModelWithReadOnlyFields:
			
 
				-    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'seed')
			
 
				-    
			
 
				-    ### Immutable fields:
			
 
				-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				-    abid = ABIDField(prefix=abid_prefix)
			
 
				-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
			
 
				-    seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
			
 
				-    
			
 
				-    ### Mutable fields:
			
 
				-    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl, one per line, should be 1:1 with snapshot_set')
			
 
				-    config = models.JSONField(default=dict)
			
 
				-    max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
			
 
				-    tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
			
 
				-    persona_id = models.UUIDField(null=True, blank=True)  # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				-    label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
			
 
				-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
			
 
				-    schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				-    modified_at = models.DateTimeField(auto_now=True)
			
 
				-    
			
 
				-    ### ModelWithKVTags:
			
 
				-    tag_set = GenericRelation(
			
 
				-        KVTag,
			
 
				-        related_query_name="crawl",
			
 
				-        content_type_field="obj_type",
			
 
				-        object_id_field="obj_id",
			
 
				-        order_by=('name',),
			
 
				-    )
			
 
				-    
			
 
				-    ### ModelWithStateMachine:
			
 
				-    state_machine_name = 'crawls.statemachines.CrawlMachine'
			
 
				-    retry_at_field_name = 'retry_at'
			
 
				-    state_field_name = 'status'
			
 
				-    StatusChoices = ModelWithStateMachine.StatusChoices
			
 
				-    active_state = StatusChoices.STARTED
			
 
				-    
			
 
				-    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
			
 
				-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
			
 
				-
			
 
				-    ### ABIDModel:
			
 
				-    abid_prefix = 'cwl_'
			
 
				-    abid_ts_src = 'self.created_at'
			
 
				-    abid_uri_src = 'self.seed.uri'
			
 
				-    abid_subtype_src = 'self.persona'
			
 
				-    abid_rand_src = 'self.id'
			
 
				-    abid_drift_allowed = True
			
 
				-    
			
 
				-    ### Managers:    
			
 
				-    snapshot_set: models.Manager['Snapshot']
			
 
				-    
			
 
				-    # @property
			
 
				-    # def persona(self) -> Persona:
			
 
				-    #     # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				-    #     return self.persona_id
			
 
				-    
			
 
				-
			
 
				-    class Meta(TypedModelMeta):
			
 
				-        verbose_name = 'Crawl'
			
 
				-        verbose_name_plural = 'Crawls'
			
 
				-        
			
 
				-    def __str__(self):
			
 
				-        url = (self.seed and self.seed.uri) or '<no url set>'
			
 
				-        parser = (self.seed and self.seed.extractor) or 'auto'
			
 
				-        created_at = self.created_at.strftime("%Y-%m-%d %H:%M") if self.created_at else '<no timestamp set>'
			
 
				-        if self.id and self.seed:
			
 
				-            return f'[{self.ABID}] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})'
			
 
				-        return f'[{self.abid_prefix}****not*saved*yet****] {url[:64]} ({parser}) @ {created_at} ({self.label or "Untitled Crawl"})'
			
 
				-        
			
 
				-    @classmethod
			
 
				-    def from_seed(cls, seed: Seed, max_depth: int=0, persona: str='Default', tags_str: str='', config: dict|None=None, created_by: int|None=None):
			
 
				-        crawl, _ = cls.objects.get_or_create(
			
 
				-            seed=seed,
			
 
				-            max_depth=max_depth,
			
 
				-            tags_str=tags_str or seed.tags_str,
			
 
				-            persona=persona or seed.config.get('DEFAULT_PERSONA') or 'Default',
			
 
				-            config=seed.config or config or {},
			
 
				-            created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id,
			
 
				-        )
			
 
				-        crawl.save()
			
 
				-        return crawl
			
 
				-        
			
 
				-    @property
			
 
				-    def template(self):
			
 
				-        """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off"""
			
 
				-        if not self.schedule:
			
 
				-            return None
			
 
				-        return self.schedule.template
			
 
				-
			
 
				-    @property
			
 
				-    def api_url(self) -> str:
			
 
				-        # /api/v1/core/crawl/{uulid}
			
 
				-        # TODO: implement get_crawl
			
 
				-        return reverse_lazy('api-1:get_crawl', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
			
 
				-
			
 
				-    @property
			
 
				-    def api_docs_url(self) -> str:
			
 
				-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
			
 
				-    
			
 
				-    def pending_snapshots(self) -> QuerySet['Snapshot']:
			
 
				-        return self.snapshot_set.filter(retry_at__isnull=False)
			
 
				-    
			
 
				-    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
			
 
				-        from core.models import ArchiveResult
			
 
				-        
			
 
				-        snapshot_ids = self.snapshot_set.values_list('id', flat=True)
			
 
				-        pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, retry_at__isnull=False)
			
 
				-        return pending_archiveresults
			
 
				-    
			
 
				-    def create_root_snapshot(self) -> 'Snapshot':
			
 
				-        print(f'Crawl[{self.ABID}].create_root_snapshot()')
			
 
				-        from core.models import Snapshot
			
 
				-        
			
 
				-        try:
			
 
				-            return Snapshot.objects.get(crawl=self, url=self.seed.uri)
			
 
				-        except Snapshot.DoesNotExist:
			
 
				-            pass
			
 
				-
			
 
				-        root_snapshot, _ = Snapshot.objects.update_or_create(
			
 
				-            crawl=self,
			
 
				-            url=self.seed.uri,
			
 
				-            defaults={
			
 
				-                'status': Snapshot.INITIAL_STATE,
			
 
				-                'retry_at': timezone.now(),
			
 
				-                'timestamp': str(timezone.now().timestamp()),
			
 
				-                # 'config': self.seed.config,
			
 
				-            },
			
 
				-        )
			
 
				-        root_snapshot.save()
			
 
				-        return root_snapshot
			
 
				-
			
 
				-
			
 
				-class Outlink(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags):
			
 
				-    """A record of a link found on a page, pointing to another page."""
			
 
				-    read_only_fields = ('id', 'src', 'dst', 'crawl', 'via')
			
 
				-    
			
 
				-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				-    
			
 
				-    src = models.URLField()   # parent page where the outlink/href was found       e.g. https://example.com/downloads
			
 
				-    dst = models.URLField()   # remote location the child outlink/href points to   e.g. https://example.com/downloads/some_file.pdf
			
 
				-    
			
 
				-    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set')
			
 
				-    via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set')
			
 
				-
			
 
				-    class Meta:
			
 
				-        unique_together = (('src', 'dst', 'via'),)
			
 
				-
			
 
				-
			
 
				-
			
 
				 
			
 
				 def validate_timestamp(value):
			
 
				     assert isinstance(value, str) and value, f'timestamp must be a non-empty string, got: "{value}"'
			
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -12,7 +12,7 @@ from django.urls import reverse_lazy
 
				 from django.utils import timezone
			
 
				 
			
 
				 from archivebox.config import CONSTANTS
			
 
				-from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
			
 
				+from base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats, get_or_create_system_user_pk
			
 
				 
			
 
				 from workers.models import ModelWithStateMachine
			
 
				 
			
@@ -21,7 +21,8 @@ if TYPE_CHECKING:
 
				 
			
 
				 
			
 
				 
			
 
				-class Seed(ABIDModel, ModelWithHealthStats):
			
 
				+
			
 
				+class Seed(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
			
 
				     """
			
 
				     A fountain that produces URLs (+metadata) each time it's queried e.g.
			
 
				         - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
			
@@ -42,36 +43,55 @@ class Seed(ABIDModel, ModelWithHealthStats):
 
				     stateful remote services, files with contents that change, directories that have new files within, etc.
			
 
				     """
			
 
				     
			
 
				-    abid_prefix = 'src_'
			
 
				-    abid_ts_src = 'self.created_at'
			
 
				-    abid_uri_src = 'self.uri'
			
 
				-    abid_subtype_src = 'self.extractor'
			
 
				-    abid_rand_src = 'self.id'
			
 
				-    abid_drift_allowed = True
			
 
				+    ### ModelWithReadOnlyFields:
			
 
				+    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'uri')
			
 
				     
			
 
				+    ### Immutable fields
			
 
				     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				     abid = ABIDField(prefix=abid_prefix)
			
 
				+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)                  # unique source location where URLs will be loaded from
			
 
				+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
			
 
				     
			
 
				-    uri = models.URLField(max_length=2000, blank=False, null=False)                          # unique source location where URLs will be loaded from
			
 
				-    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
			
 
				-    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
			
 
				-    
			
 
				+    ### Mutable fields:
			
 
				     extractor = models.CharField(default='auto', max_length=32, help_text='The parser / extractor to use to load URLs from this source (default: auto)')
			
 
				     tags_str = models.CharField(max_length=255, null=False, blank=True, default='', help_text='An optional comma-separated list of tags to attach to any URLs that come from this source')
			
 
				-    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
			
 
				-    
			
 
				-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
			
 
				+    label = models.CharField(max_length=255, null=False, blank=True, default='', help_text='A human-readable label for this seed')
			
 
				     modified_at = models.DateTimeField(auto_now=True)
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
			
 
				 
			
 
				+    ### ModelWithConfig:
			
 
				+    config = models.JSONField(default=dict, help_text='An optional JSON object containing extra config to put in scope when loading URLs from this source')
			
 
				+
			
 
				+    ### ModelWithOutputDir:
			
 
				+    output_dir = models.CharField(max_length=255, null=False, blank=True, default='', help_text='The directory to store the output of this seed')
			
 
				 
			
 
				+    ### ModelWithNotes:
			
 
				+    notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this seed should have')
			
 
				+
			
 
				+    ### ModelWithKVTags:
			
 
				+    tag_set = GenericRelation(
			
 
				+        KVTag,
			
 
				+        related_query_name="seed",
			
 
				+        content_type_field="obj_type",
			
 
				+        object_id_field="obj_id",
			
 
				+        order_by=('name',),
			
 
				+    )
			
 
				+    
			
 
				+    ### ABIDModel:
			
 
				+    abid_prefix = 'src_'
			
 
				+    abid_ts_src = 'self.created_at'
			
 
				+    abid_uri_src = 'self.uri'
			
 
				+    abid_subtype_src = 'self.extractor'
			
 
				+    abid_rand_src = 'self.id'
			
 
				+    abid_drift_allowed = True
			
 
				+    
			
 
				+    ### Managers:
			
 
				     crawl_set: models.Manager['Crawl']
			
 
				 
			
 
				     class Meta:
			
 
				         verbose_name = 'Seed'
			
 
				         verbose_name_plural = 'Seeds'
			
 
				         
			
 
				-        unique_together = (('created_by', 'uri', 'extractor'),)
			
 
				+        unique_together = (('created_by', 'uri', 'extractor'),('created_by', 'label'))
			
 
				 
			
 
				 
			
 
				     @classmethod
			
@@ -122,35 +142,48 @@ class Seed(ABIDModel, ModelWithHealthStats):
 
				 
			
 
				 
			
 
				 
			
 
				-
			
 
				-class CrawlSchedule(ABIDModel, ModelWithHealthStats):
			
 
				+class CrawlSchedule(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithNotes, ModelWithHealthStats):
			
 
				     """
			
 
				     A record for a job that should run repeatedly on a given schedule.
			
 
				     
			
 
				     It pulls from a given Seed and creates a new Crawl for each scheduled run.
			
 
				     The new Crawl will inherit all the properties of the crawl_template Crawl.
			
 
				     """
			
 
				-    abid_prefix = 'cws_'
			
 
				-    abid_ts_src = 'self.created_at'
			
 
				-    abid_uri_src = 'self.created_by_id'
			
 
				-    abid_subtype_src = 'self.schedule'
			
 
				-    abid_rand_src = 'self.id'
			
 
				+    ### ModelWithReadOnlyFields:
			
 
				+    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'template_id')
			
 
				     
			
 
				+    ### Immutable fields:
			
 
				     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				     abid = ABIDField(prefix=abid_prefix)
			
 
				+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
			
 
				+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
			
 
				+    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
			
 
				     
			
 
				+    ### Mutable fields
			
 
				     schedule = models.CharField(max_length=64, blank=False, null=False, help_text='The schedule to run this crawl on in CRON syntax e.g. 0 0 * * * (see https://crontab.guru/)')
			
 
				+    is_enabled = models.BooleanField(default=True)
			
 
				     label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this scheduled crawl')
			
 
				     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
			
 
				-    
			
 
				-    template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False, help_text='The base crawl that each new scheduled job should copy as a template')  # type: ignore
			
 
				-    
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
			
 
				-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
			
 
				     modified_at = models.DateTimeField(auto_now=True)
			
 
				     
			
 
				-    is_enabled = models.BooleanField(default=True)
			
 
				+    ### ModelWithKVTags:
			
 
				+    tag_set = GenericRelation(
			
 
				+        KVTag,
			
 
				+        related_query_name="crawlschedule",
			
 
				+        content_type_field="obj_type",
			
 
				+        object_id_field="obj_id",
			
 
				+        order_by=('name',),
			
 
				+    )
			
 
				+    
			
 
				+    ### ABIDModel:
			
 
				+    abid_prefix = 'cws_'
			
 
				+    abid_ts_src = 'self.created_at'
			
 
				+    abid_uri_src = 'self.template.seed.uri'
			
 
				+    abid_subtype_src = 'self.template.persona'
			
 
				+    abid_rand_src = 'self.id'
			
 
				+    abid_drift_allowed = True
			
 
				     
			
 
				+    ### Managers:
			
 
				     crawl_set: models.Manager['Crawl']
			
 
				     
			
 
				     class Meta(TypedModelMeta):
			
@@ -189,9 +222,44 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
 
				         return Snapshot.objects.filter(crawl_id__in=crawl_ids)
			
 
				     
			
 
				 
			
 
				+class CrawlManager(models.Manager):
			
 
				+    pass
			
 
				+
			
 
				+class CrawlQuerySet(models.QuerySet):
			
 
				+    """
			
 
				+    Enhanced QuerySet for Crawl that adds some useful methods.
			
 
				+    
			
 
				+    To get all the snapshots for a given set of Crawls:
			
 
				+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').snapshots() -> QuerySet[Snapshot]
			
 
				+    
			
 
				+    To get all the archiveresults for a given set of Crawls:
			
 
				+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').archiveresults() -> QuerySet[ArchiveResult]
			
 
				+    
			
 
				+    To export the list of Crawls as a CSV or JSON:
			
 
				+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_csv() -> str
			
 
				+        Crawl.objects.filter(seed__uri='https://example.com/some/rss.xml').export_as_json() -> str
			
 
				+    """
			
 
				+    def snapshots(self, **filter_kwargs) -> QuerySet['Snapshot']:
			
 
				+        return Snapshot.objects.filter(crawl_id__in=self.values_list('pk', flat=True), **filter_kwargs)
			
 
				     
			
 
				+    def archiveresults(self) -> QuerySet['ArchiveResult']:
			
 
				+        return ArchiveResult.objects.filter(snapshot__crawl_id__in=self.values_list('pk', flat=True))
			
 
				+    
			
 
				+    def as_csv_str(self, keys: Iterable[str]=()) -> str:
			
 
				+        return '\n'.join(
			
 
				+            row.as_csv(keys=keys)
			
 
				+            for row in self.all()
			
 
				+        )
			
 
				+    
			
 
				+    def as_jsonl_str(self, keys: Iterable[str]=()) -> str:
			
 
				+        return '\n'.join([
			
 
				+            row.as_jsonl_row(keys=keys)
			
 
				+            for row in self.all()
			
 
				+        ])
			
 
				 
			
 
				-class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
			
 
				+
			
 
				+
			
 
				+class Crawl(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags, ABIDModel, ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
			
 
				     """
			
 
				     A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
			
 
				 
			
@@ -201,49 +269,63 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
 
				     Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a 
			
 
				     file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
			
 
				     """
			
 
				-    abid_prefix = 'cwl_'
			
 
				-    abid_ts_src = 'self.created_at'
			
 
				-    abid_uri_src = 'self.seed.uri'
			
 
				-    abid_subtype_src = 'self.persona'
			
 
				-    abid_rand_src = 'self.id'
			
 
				-    abid_drift_allowed = True
			
 
				     
			
 
				-    state_machine_name = 'crawls.statemachines.CrawlMachine'
			
 
				-    retry_at_field_name = 'retry_at'
			
 
				-    state_field_name = 'status'
			
 
				-    StatusChoices = ModelWithStateMachine.StatusChoices
			
 
				-    active_state = StatusChoices.STARTED
			
 
				+    ### ModelWithReadOnlyFields:
			
 
				+    read_only_fields = ('id', 'abid', 'created_at', 'created_by', 'seed')
			
 
				     
			
 
				+    ### Immutable fields:
			
 
				     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				     abid = ABIDField(prefix=abid_prefix)
			
 
				-
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
			
 
				     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
			
 
				-    modified_at = models.DateTimeField(auto_now=True)
			
 
				-    
			
 
				-    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
			
 
				-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
			
 
				-
			
 
				+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
			
 
				     seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
			
 
				-    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl')
			
 
				     
			
 
				+    ### Mutable fields:
			
 
				+    urls = models.TextField(blank=True, null=False, default='', help_text='The log of URLs discovered in this crawl, one per line, should be 1:1 with snapshot_set')
			
 
				+    config = models.JSONField(default=dict)
			
 
				+    max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
			
 
				+    tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
			
 
				+    persona_id = models.UUIDField(null=True, blank=True)  # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				     label = models.CharField(max_length=64, blank=True, null=False, default='', help_text='A human-readable label for this crawl')
			
 
				     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this crawl should have')
			
 
				+    schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				+    modified_at = models.DateTimeField(auto_now=True)
			
 
				     
			
 
				-    max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
			
 
				-    tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
			
 
				-    persona = models.CharField(max_length=32, blank=True, null=False, default='auto')
			
 
				-    config = models.JSONField(default=dict)
			
 
				+    ### ModelWithKVTags:
			
 
				+    tag_set = GenericRelation(
			
 
				+        KVTag,
			
 
				+        related_query_name="crawl",
			
 
				+        content_type_field="obj_type",
			
 
				+        object_id_field="obj_id",
			
 
				+        order_by=('name',),
			
 
				+    )
			
 
				     
			
 
				-    schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				+    ### ModelWithStateMachine:
			
 
				+    state_machine_name = 'crawls.statemachines.CrawlMachine'
			
 
				+    retry_at_field_name = 'retry_at'
			
 
				+    state_field_name = 'status'
			
 
				+    StatusChoices = ModelWithStateMachine.StatusChoices
			
 
				+    active_state = StatusChoices.STARTED
			
 
				     
			
 
				-    # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
			
 
				-    # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
			
 
				-    # schedule = models.JSONField()
			
 
				-    # config = models.JSONField()
			
 
				+    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
			
 
				+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
			
 
				+
			
 
				+    ### ABIDModel:
			
 
				+    abid_prefix = 'cwl_'
			
 
				+    abid_ts_src = 'self.created_at'
			
 
				+    abid_uri_src = 'self.seed.uri'
			
 
				+    abid_subtype_src = 'self.persona'
			
 
				+    abid_rand_src = 'self.id'
			
 
				+    abid_drift_allowed = True
			
 
				     
			
 
				+    ### Managers:    
			
 
				     snapshot_set: models.Manager['Snapshot']
			
 
				     
			
 
				+    # @property
			
 
				+    # def persona(self) -> Persona:
			
 
				+    #     # TODO: replace with self.persona = models.ForeignKey(Persona, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
			
 
				+    #     return self.persona_id
			
 
				+    
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				         verbose_name = 'Crawl'
			
@@ -305,7 +387,7 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
 
				             return Snapshot.objects.get(crawl=self, url=self.seed.uri)
			
 
				         except Snapshot.DoesNotExist:
			
 
				             pass
			
 
				-  
			
 
				+
			
 
				         root_snapshot, _ = Snapshot.objects.update_or_create(
			
 
				             crawl=self,
			
 
				             url=self.seed.uri,
			
@@ -320,8 +402,10 @@ class Crawl(ABIDModel, ModelWithHealthStats, ModelWithStateMachine):
 
				         return root_snapshot
			
 
				 
			
 
				 
			
 
				-class Outlink(models.Model):
			
 
				+class Outlink(ModelWithReadOnlyFields, ModelWithSerializers, ModelWithUUID, ModelWithKVTags):
			
 
				     """A record of a link found on a page, pointing to another page."""
			
 
				+    read_only_fields = ('id', 'src', 'dst', 'crawl', 'via')
			
 
				+    
			
 
				     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
			
 
				     
			
 
				     src = models.URLField()   # parent page where the outlink/href was found       e.g. https://example.com/downloads
			
--- a/archivebox/workers/actor.py
+++ b/archivebox/workers/actor.py
@@ -1,583 +1,166 @@
 
				-__package__ = 'archivebox.workers'
			
 
				-
			
 
				-import os
			
 
				-import time
			
 
				-import traceback
			
 
				-from typing import ClassVar, Generic, TypeVar, Any, Literal, Type, Iterable, cast, get_args
			
 
				-from datetime import timedelta
			
 
				-import multiprocessing
			
 
				-from multiprocessing import Process, cpu_count
			
 
				-
			
 
				-import psutil
			
 
				-from rich import print
			
 
				-from statemachine import State, StateMachine
			
 
				-
			
 
				-from django import db
			
 
				-from django.db.models import QuerySet, sql, Q
			
 
				-from django.db.models import Model as DjangoModel
			
 
				-from django.utils import timezone
			
 
				-from django.utils.functional import classproperty
			
 
				-
			
 
				-# from archivebox.logging_util import TimedProgress
			
 
				-
			
 
				-from .models import ModelWithStateMachine
			
 
				-
			
 
				-
			
 
				-multiprocessing.set_start_method('fork', force=True)
			
 
				-
			
 
				-
			
 
				-class ActorObjectAlreadyClaimed(Exception):
			
 
				-    """Raised when the Actor tries to claim the next object from the queue but it's already been claimed by another Actor"""
			
 
				-    pass
			
 
				-
			
 
				-class ActorQueueIsEmpty(Exception):
			
 
				-    """Raised when the Actor tries to get the next object from the queue but it's empty"""
			
 
				-    pass
			
 
				-
			
 
				-CPU_COUNT = cpu_count()
			
 
				-DEFAULT_MAX_TICK_TIME = 60
			
 
				-DEFAULT_MAX_CONCURRENT_ACTORS = min(max(2, int(CPU_COUNT * 0.6)), 8)   # 2 < (60% * num available cpu cores) < 8
			
 
				-
			
 
				-limit = lambda n, max: min(n, max)
			
 
				-
			
 
				-LaunchKwargs = dict[str, Any]
			
 
				-ObjectState = State | str
			
 
				-ObjectStateList = Iterable[ObjectState]
			
 
				-
			
 
				-ModelType = TypeVar('ModelType', bound=ModelWithStateMachine)
			
 
				-
			
 
				-class ActorType(Generic[ModelType]):
			
 
				-    """
			
 
				-    Base class for all actors. Usage:
			
 
				-    
			
 
				-    class FaviconActor(ActorType[FaviconArchiveResult]):
			
 
				-        ACTIVE_STATE: ClassVar[str] = 'started'
			
 
				-        
			
 
				-        @classmethod
			
 
				-        def qs(cls) -> QuerySet[FaviconArchiveResult]:
			
 
				-            return ArchiveResult.objects.filter(extractor='favicon')   # or leave the default: FaviconArchiveResult.objects.all()
			
 
				-    """
			
 
				-    
			
 
				-    ### Class attributes (defined on the class at compile-time when ActorType[MyModel] is defined)
			
 
				-    Model: Type[ModelType]
			
 
				-    StateMachineClass: Type[StateMachine]
			
 
				-    
			
 
				-    ACTIVE_STATE: ClassVar[ObjectState] = 'started'
			
 
				-    EVENT_NAME: ClassVar[str] = 'tick'                                    # the event name to trigger on the obj.sm: StateMachine (usually 'tick')
			
 
				-    
			
 
				-    CLAIM_ORDER: ClassVar[tuple[str, ...]] = ('-retry_at',)                # the .order(*args) to claim the queue objects in, use ('?',) for random order
			
 
				-    CLAIM_FROM_TOP_N: ClassVar[int] = CPU_COUNT * 10                      # the number of objects to consider when atomically getting the next object from the queue
			
 
				-    CLAIM_ATOMIC: ClassVar[bool] = True                                   # whether to atomically fetch+claim the next object in one query, or fetch and lock it in two queries
			
 
				-    
			
 
				-    MAX_TICK_TIME: ClassVar[int] = DEFAULT_MAX_TICK_TIME                  # maximum duration in seconds to process a single object
			
 
				-    MAX_CONCURRENT_ACTORS: ClassVar[int] = DEFAULT_MAX_CONCURRENT_ACTORS  # maximum number of concurrent actors that can be running at once
			
 
				-    
			
 
				-    _SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = []      # used to record all the pids of Actors spawned on the class
			
 
				-    
			
 
				-    ### Instance attributes (only used within an actor instance inside a spawned actor thread/process)
			
 
				-    pid: int = os.getpid()
			
 
				-    idle_count: int = 0
			
 
				-    launch_kwargs: LaunchKwargs = {}
			
 
				-    mode: Literal['thread', 'process'] = 'process'
			
 
				-    
			
 
				-    def __init_subclass__(cls) -> None:
			
 
				-        """
			
 
				-        Executed at class definition time (i.e. during import of any file containing class MyActor(ActorType[MyModel]): ...).
			
 
				-        Loads the django Model from the Generic[ModelType] TypeVar arg and populates any missing class-level config using it.
			
 
				-        """
			
 
				-        if getattr(cls, 'Model', None) is None:
			
 
				-            cls.Model = cls._get_model_from_generic_typevar()
			
 
				-        cls._populate_missing_classvars_from_model(cls.Model)
			
 
				-    
			
 
				-    def __init__(self, mode: Literal['thread', 'process']|None=None, **launch_kwargs: LaunchKwargs):
			
 
				-        """
			
 
				-        Executed right before the Actor is spawned to create a unique Actor instance for that thread/process.
			
 
				-        actor_instance.runloop() is then executed from inside the newly spawned thread/process.
			
 
				-        """
			
 
				-        self.mode = mode or self.mode
			
 
				-        self.launch_kwargs = launch_kwargs or dict(self.launch_kwargs)
			
 
				-    
			
 
				-
			
 
				-    ### Private Helper Methods: Not desiged to be overridden by subclasses or called by anything outside of this class
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def name(cls) -> str:
			
 
				-        return cls.__name__  # type: ignore
			
 
				-    
			
 
				-    def __str__(self) -> str:
			
 
				-        return repr(self)
			
 
				-    
			
 
				-    def __repr__(self) -> str:
			
 
				-        """-> FaviconActor[pid=1234]"""
			
 
				-        label = 'pid' if self.mode == 'process' else 'tid'
			
 
				-        # return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
			
 
				-        return f'[underline]Worker[/underline]\\[{label}={self.pid}]'
			
 
				-    
			
 
				-    @staticmethod
			
 
				-    def _state_to_str(state: ObjectState) -> str:
			
 
				-        """Convert a statemachine.State, models.TextChoices.choices value, or Enum value to a str"""
			
 
				-        return str(state.value) if isinstance(state, State) else str(state)
			
 
				-    
			
 
				-    @staticmethod
			
 
				-    def _sql_for_select_top_n_candidates(qs: QuerySet, claim_from_top_n: int=CLAIM_FROM_TOP_N) -> tuple[str, tuple[Any, ...]]:
			
 
				-        """Get the SQL for selecting the top N candidates from the queue (to claim one from)"""
			
 
				-        queryset = qs.only('id')[:claim_from_top_n]
			
 
				-        select_sql, select_params = compile_sql_select(queryset)
			
 
				-        return select_sql, select_params
			
 
				-    
			
 
				-    @staticmethod
			
 
				-    def _sql_for_update_claimed_obj(qs: QuerySet, update_kwargs: dict[str, Any]) -> tuple[str, tuple[Any, ...]]:
			
 
				-        """Get the SQL for updating a claimed object to mark it as ACTIVE"""
			
 
				-        # qs.update(status='started', retry_at=<now + MAX_TICK_TIME>)
			
 
				-        update_sql, update_params = compile_sql_update(qs, update_kwargs=update_kwargs)
			
 
				-        # e.g. UPDATE core_archiveresult SET status='%s', retry_at='%s' WHERE status NOT IN ('succeeded', 'failed', 'sealed', 'started') AND retry_at <= '2024-11-04 10:14:33.240903'
			
 
				-        return update_sql, update_params
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def _get_model_from_generic_typevar(cls) -> Type[ModelType]:
			
 
				-        """Get the django Model from the Generic[ModelType] TypeVar arg (and check that it inherits from django.db.models.Model)"""
			
 
				-        # cls.__orig_bases__ is non-standard and may be removed in the future! if this breaks,
			
 
				-        # we can just require the inerited class to define the Model as a classvar manually, e.g.:
			
 
				-        #     class SnapshotActor(ActorType[Snapshot]):
			
 
				-        #         Model: ClassVar[Type[Snapshot]] = Snapshot
			
 
				-        # https://stackoverflow.com/questions/57706180/generict-base-class-how-to-get-type-of-t-from-within-instance
			
 
				-        Model = get_args(cls.__orig_bases__[0])[0]   # type: ignore
			
 
				-        assert issubclass(Model, DjangoModel), f'{cls.__name__}.Model must be a valid django Model'
			
 
				-        return cast(Type[ModelType], Model)
			
 
				-    
			
 
				-
			
 
				-    @classmethod
			
 
				-    def _get_state_machine_instance(cls, obj: ModelType) -> StateMachine:
			
 
				-        """Get the StateMachine instance for the given django Model instance (and check that it is a valid instance of cls.StateMachineClass)"""
			
 
				-        obj_statemachine = None
			
 
				-        state_machine_attr = getattr(obj, 'state_machine_attr', 'sm')
			
 
				-        try:
			
 
				-            obj_statemachine = getattr(obj, state_machine_attr)
			
 
				-        except Exception:
			
 
				-            pass
			
 
				-        
			
 
				-        if not isinstance(obj_statemachine, cls.StateMachineClass):
			
 
				-            raise Exception(f'{cls.__name__}: Failed to find a valid StateMachine instance at {type(obj).__name__}.{state_machine_attr}')
			
 
				+# __package__ = 'archivebox.workers'
			
 
				+
			
 
				+# import time
			
 
				+
			
 
				+
			
 
				+# from typing import ClassVar, Type, Iterable, TypedDict
			
 
				+# from django.db.models import QuerySet
			
 
				+# from django.db import transaction
			
 
				+# from django.utils import timezone
			
 
				+# from django.utils.functional import classproperty       # type: ignore
			
 
				+
			
 
				+# from .models import Event, Process, EventDict
			
 
				+
			
 
				+
			
 
				+# class ActorType:
			
 
				+#     # static class attributes
			
 
				+#     name: ClassVar[str]
			
 
				+#     event_prefix: ClassVar[str]
			
 
				+#     poll_interval: ClassVar[int] = 1
			
 
				+    
			
 
				+#     @classproperty
			
 
				+#     def event_queue(cls) -> QuerySet[Event]:
			
 
				+#         return Event.objects.filter(type__startswith=cls.event_prefix)
			
 
				+
			
 
				+#     @classmethod
			
 
				+#     def fork(cls, wait_for_first_event=False, exit_on_idle=True) -> Process:
			
 
				+#         cmd = ['archivebox', 'actor', cls.name]
			
 
				+#         if exit_on_idle:
			
 
				+#             cmd.append('--exit-on-idle')
			
 
				+#         if wait_for_first_event:
			
 
				+#             cmd.append('--wait-for-first-event')
			
 
				+#         return Process.create_and_fork(cmd=cmd, actor_type=cls.name)
			
 
				+
			
 
				+#     @classproperty
			
 
				+#     def processes(cls) -> QuerySet[Process]:
			
 
				+#         return Process.objects.filter(actor_type=cls.name)
			
 
				+
			
 
				+#     @classmethod
			
 
				+#     def run(cls, wait_for_first_event=False, exit_on_idle=True):
			
 
				+
			
 
				+#         if wait_for_first_event:
			
 
				+#             event = cls.event_queue.get_next_unclaimed()
			
 
				+#             while not event:
			
 
				+#                 time.sleep(cls.poll_interval)
			
 
				+#                 event = cls.event_queue.get_next_unclaimed()
			
 
				+
			
 
				+#         while True:
			
 
				+#             output_events = list(cls.process_next_event()) or list(cls.process_idle_tick())   # process next event, or tick if idle
			
 
				+#             yield from output_events
			
 
				+#             if not output_events:
			
 
				+#                 if exit_on_idle:
			
 
				+#                     break
			
 
				+#                 else:
			
 
				+#                     time.sleep(cls.poll_interval)
			
 
				+
			
 
				+#     @classmethod
			
 
				+#     def process_next_event(cls) -> Iterable[EventDict]:
			
 
				+#         event = cls.event_queue.get_next_unclaimed()
			
 
				+#         output_events = []
			
 
				+        
			
 
				+#         if not event:
			
 
				+#             return []
			
 
				+        
			
 
				+#         cls.mark_event_claimed(event, duration=60)
			
 
				+#         try:
			
 
				+#             for output_event in cls.receive(event):
			
 
				+#                 output_events.append(output_event)
			
 
				+#                 yield output_event
			
 
				+#             cls.mark_event_succeeded(event, output_events=output_events)
			
 
				+#         except BaseException as e:
			
 
				+#             cls.mark_event_failed(event, output_events=output_events, error=e)
			
 
				+
			
 
				+#     @classmethod
			
 
				+#     def process_idle_tick(cls) -> Iterable[EventDict]:
			
 
				+#         # reset the idle event to be claimed by the current process
			
 
				+#         event, _created = Event.objects.update_or_create(
			
 
				+#             name=f'{cls.event_prefix}IDLE',
			
 
				+#             emitted_by=Process.current(),
			
 
				+#             defaults={
			
 
				+#                 'deliver_at': timezone.now(),
			
 
				+#                 'claimed_proc': None,
			
 
				+#                 'claimed_at': None,
			
 
				+#                 'finished_at': None,
			
 
				+#                 'error': None,
			
 
				+#                 'parent': None,
			
 
				+#             },
			
 
				+#         )
			
 
				+        
			
 
				+#         # then process it like any other event
			
 
				+#         yield from cls.process_next_event()
			
 
				+
			
 
				+#     @classmethod
			
 
				+#     def receive(cls, event: Event) -> Iterable[EventDict]:
			
 
				+#         handler_method = getattr(cls, f'on_{event.name}', None)
			
 
				+#         if handler_method:
			
 
				+#             yield from handler_method(event)
			
 
				+#         else:
			
 
				+#             raise Exception(f'No handler method for event: {event.name}')
			
 
				+
			
 
				+#     @staticmethod
			
 
				+#     def on_IDLE() -> Iterable[EventDict]:
			
 
				+#         return []
			
 
				+    
			
 
				+#     @staticmethod
			
 
				+#     def mark_event_claimed(event: Event, duration: int=60):
			
 
				+#         proc = Process.current()
			
 
				+        
			
 
				+#         with transaction.atomic():
			
 
				+#             claimed = Event.objects.filter(id=event.id, claimed_proc=None, claimed_at=None).update(claimed_proc=proc, claimed_at=timezone.now())
			
 
				+#             if not claimed:
			
 
				+#                 event.refresh_from_db()
			
 
				+#                 raise Exception(f'Event already claimed by another process: {event.claimed_proc}')
			
 
				             
			
 
				-        return obj_statemachine
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def _populate_missing_classvars_from_model(cls, Model: Type[ModelType]):
			
 
				-        """Check that the class variables are set correctly based on the ModelType"""
			
 
				-        
			
 
				-        # check that Model is the same as the Generic[ModelType] parameter in the class definition
			
 
				-        cls.Model = getattr(cls, 'Model', None) or Model
			
 
				-        if cls.Model != Model:
			
 
				-            raise ValueError(f'{cls.__name__}.Model must be set to the same Model as the Generic[ModelType] parameter in the class definition')
			
 
				-        
			
 
				-        # check that Model has a valid StateMachine with the required event defined on it
			
 
				-        cls.StateMachineClass = getattr(cls, 'StateMachineClass', None)      # type: ignore
			
 
				-        assert isinstance(cls.EVENT_NAME, str), f'{cls.__name__}.EVENT_NAME must be a str, got: {type(cls.EVENT_NAME).__name__} instead'
			
 
				-        assert hasattr(cls.StateMachineClass, cls.EVENT_NAME), f'StateMachine {cls.StateMachineClass.__name__} must define a {cls.EVENT_NAME} event ({cls.__name__}.EVENT_NAME = {cls.EVENT_NAME})'
			
 
				-        
			
 
				-        # check that Model uses .id as its primary key field
			
 
				-        primary_key_field = cls.Model._meta.pk.name
			
 
				-        if primary_key_field != 'id':
			
 
				-            raise NotImplementedError(f'Actors currently only support models that use .id as their primary key field ({cls.__name__} uses {cls.__name__}.{primary_key_field} as primary key)')
			
 
				-        
			
 
				-        # check that ACTIVE_STATE is defined and that it exists on the StateMachineClass
			
 
				-        if not getattr(cls, 'ACTIVE_STATE', None):
			
 
				-            raise NotImplementedError(f'{cls.__name__} must define an ACTIVE_STATE: ClassVar[State] (e.g. SnapshotMachine.started) ({cls.Model.__name__}.{cls.Model.state_field_name} gets set to this value to mark objects as actively processing)')
			
 
				-        assert isinstance(cls.ACTIVE_STATE, (State, str)) and hasattr(cls.StateMachineClass, cls._state_to_str(cls.ACTIVE_STATE)), f'{cls.__name__}.ACTIVE_STATE must be a statemachine.State | str that exists on {cls.StateMachineClass.__name__}, got: {type(cls.ACTIVE_STATE).__name__} instead'
			
 
				-        
			
 
				-        # check the other ClassVar attributes for valid values
			
 
				-        assert cls.CLAIM_ORDER and isinstance(cls.CLAIM_ORDER, tuple) and all(isinstance(order, str) for order in cls.CLAIM_ORDER), f'{cls.__name__}.CLAIM_ORDER must be a non-empty tuple[str, ...], got: {type(cls.CLAIM_ORDER).__name__} instead'
			
 
				-        assert cls.CLAIM_FROM_TOP_N > 0, f'{cls.__name__}.CLAIM_FROM_TOP_N must be a positive int, got: {cls.CLAIM_FROM_TOP_N} instead'
			
 
				-        assert cls.MAX_TICK_TIME >= 1, f'{cls.__name__}.MAX_TICK_TIME must be a positive int > 1, got: {cls.MAX_TICK_TIME} instead'
			
 
				-        assert cls.MAX_CONCURRENT_ACTORS >= 1, f'{cls.__name__}.MAX_CONCURRENT_ACTORS must be a positive int >=1, got: {cls.MAX_CONCURRENT_ACTORS} instead'
			
 
				-        assert isinstance(cls.CLAIM_ATOMIC, bool), f'{cls.__name__}.CLAIM_ATOMIC must be a bool, got: {cls.CLAIM_ATOMIC} instead'
			
 
				-
			
 
				-    # @classmethod
			
 
				-    # def _fork_actor_as_thread(cls, **launch_kwargs: LaunchKwargs) -> int:
			
 
				-    #     """Spawn a new background thread running the actor's runloop"""
			
 
				-    #     actor = cls(mode='thread', **launch_kwargs)
			
 
				-    #     bg_actor_thread = Thread(target=actor.runloop)
			
 
				-    #     bg_actor_thread.start()
			
 
				-    #     assert bg_actor_thread.native_id is not None
			
 
				-    #     return bg_actor_thread.native_id
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def _fork_actor_as_process(cls, **launch_kwargs: LaunchKwargs) -> int:
			
 
				-        """Spawn a new background process running the actor's runloop"""
			
 
				-        actor = cls(mode='process', **launch_kwargs)
			
 
				-        bg_actor_process = Process(target=actor.runloop)
			
 
				-        bg_actor_process.start()
			
 
				-        assert bg_actor_process.pid is not None
			
 
				-        cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
			
 
				-        return bg_actor_process.pid
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def _obj_repr(cls, obj: ModelType | Any) -> str:
			
 
				-        """Get a string representation of the given django Model instance"""
			
 
				-        return f'[grey53]{type(obj).__name__}\\[{obj.ABID}][/grey53]'
			
 
				-    
			
 
				-    ### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def get_running_actors(cls) -> list[int]:
			
 
				-        """returns a list of pids of all running actors of this type"""
			
 
				-        # WARNING: only works for process actors, not thread actors
			
 
				-        if cls.mode == 'thread':
			
 
				-            raise NotImplementedError('get_running_actors() is not implemented for thread actors')
			
 
				-        return [
			
 
				-            proc.pid for proc in cls._SPAWNED_ACTOR_PIDS
			
 
				-            if proc.is_running() and proc.status() != 'zombie'
			
 
				-        ]
			
 
				-        
			
 
				-    @classmethod
			
 
				-    def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
			
 
				-        """Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
			
 
				-        queue_length = queue.count()
			
 
				-        if not queue_length:                                      # queue is empty, spawn 0 actors
			
 
				-            return []
			
 
				-        
			
 
				-        # WARNING:
			
 
				-        # spawning new actors processes is slow/expensive, avoid spawning many actors at once in a single orchestrator tick.
			
 
				-        # limit to spawning 1 or 2 at a time per orchestrator tick, and let the next tick handle starting another couple.
			
 
				-        # DONT DO THIS:
			
 
				-        # if queue_length > 20:                      # queue is extremely long, spawn maximum actors at once!
			
 
				-        #   num_to_spawn_this_tick = cls.MAX_CONCURRENT_ACTORS
			
 
				-        
			
 
				-        if queue_length > 10:    
			
 
				-            num_to_spawn_this_tick = 2  # spawn more actors per tick if queue is long
			
 
				-        else:
			
 
				-            num_to_spawn_this_tick = 1  # spawn fewer actors per tick if queue is short
			
 
				-        
			
 
				-        num_remaining = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
			
 
				-        num_to_spawn_now: int = limit(num_to_spawn_this_tick, num_remaining)
			
 
				-        
			
 
				-        actors_launch_kwargs: list[LaunchKwargs] = num_to_spawn_now * [{**cls.launch_kwargs}]
			
 
				-        return actors_launch_kwargs
			
 
				-        
			
 
				-    @classmethod
			
 
				-    def start(cls, mode: Literal['thread', 'process']='process', **launch_kwargs: LaunchKwargs) -> int:
			
 
				-        if mode == 'thread':
			
 
				-            raise NotImplementedError('Thread-based actors are disabled to reduce codebase complexity. Please use processes for everything')
			
 
				-            # return cls._fork_actor_as_thread(**launch_kwargs)
			
 
				-        elif mode == 'process':
			
 
				-            return cls._fork_actor_as_process(**launch_kwargs)
			
 
				-        raise ValueError(f'Invalid actor mode: {mode} must be "thread" or "process"')
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def qs(cls) -> QuerySet[ModelType]:
			
 
				-        """
			
 
				-        Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about.
			
 
				-        Override this in the subclass to define the QuerySet of objects that the Actor is going to poll for new work.
			
 
				-        (don't limit, order, or filter this by retry_at or status yet, Actor.get_queue() handles that part)
			
 
				-        """
			
 
				-        return cls.Model.objects.filter()
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def final_q(cls) -> Q:
			
 
				-        """Get the filter for objects that are already completed / in a final state"""
			
 
				-        return Q(**{
			
 
				-            f'{cls.Model.state_field_name}__in': [cls._state_to_str(s) for s in cls.StateMachineClass.final_states],
			
 
				-        })  # status__in=('sealed', 'failed', 'succeeded')
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def active_q(cls) -> Q:
			
 
				-        """Get the filter for objects that are marked active (and are still running / not timed out)"""
			
 
				-        return Q(retry_at__gte=timezone.now(), **{cls.Model.state_field_name: cls._state_to_str(cls.ACTIVE_STATE)})   # e.g. Q(status='started')
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def stalled_q(cls) -> Q:
			
 
				-        """Get the filter for objects that are marked active but are timed out"""
			
 
				-        return Q(retry_at__lte=timezone.now(), **{cls.Model.state_field_name: cls._state_to_str(cls.ACTIVE_STATE)})                     # e.g. Q(status='started') AND Q(<retry_at is in the past>)
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def future_q(cls) -> Q:
			
 
				-        """Get the filter for objects that have a retry_at in the future"""
			
 
				-        return Q(retry_at__gt=timezone.now(), **{cls.Model.state_field_name: 'QUEUED'})
			
 
				-    
			
 
				-    @classproperty
			
 
				-    def pending_q(cls) -> Q:
			
 
				-        """Get the filter for objects that are ready for processing."""
			
 
				-        return ~Q(**{
			
 
				-            f'{cls.Model.state_field_name}__in': (*[cls._state_to_str(s) for s in cls.StateMachineClass.final_states], cls._state_to_str(cls.ACTIVE_STATE))
			
 
				-        })  # status__not_in=('sealed', 'failed', 'succeeded', 'started')
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def get_queue(cls, sort: bool=True) -> QuerySet[ModelType]:
			
 
				-        """
			
 
				-        Get the sorted and filtered QuerySet of objects that are ready for processing.
			
 
				-        e.g. qs.exclude(status__in=('sealed', 'started'), retry_at__gt=timezone.now()).order_by('retry_at')
			
 
				-        """
			
 
				-        unsorted_qs = cls.qs.filter(cls.pending_q) | cls.qs.filter(cls.stalled_q)
			
 
				-        return unsorted_qs.order_by(*cls.CLAIM_ORDER) if sort else unsorted_qs
			
 
				-
			
 
				-    ### Instance Methods: Only called from within Actor instance after it has been spawned (i.e. forked as a thread or process)
			
 
				-    
			
 
				-    def runloop(self):
			
 
				-        """The main runloop that starts running when the actor is spawned (as subprocess or thread) and exits when the queue is empty"""
			
 
				-        self.on_startup()
			
 
				-        obj_to_process: ModelType | None = None
			
 
				-        last_error: BaseException | None = None
			
 
				-        try:
			
 
				-            while True:
			
 
				-                # Get the next object to process from the queue
			
 
				-                try:
			
 
				-                    obj_to_process = cast(ModelType, self.get_next(atomic=self.CLAIM_ATOMIC))
			
 
				-                except (ActorQueueIsEmpty, ActorObjectAlreadyClaimed) as err:
			
 
				-                    last_error = err
			
 
				-                    obj_to_process = None
			
 
				-                
			
 
				-                # Handle the case where there is no next object to process
			
 
				-                if obj_to_process:
			
 
				-                    self.idle_count = 0   # reset idle count if we got an object
			
 
				-                else:
			
 
				-                    if self.idle_count >= 3:
			
 
				-                        break             # stop looping and exit if queue is empty and we have idled for 30sec
			
 
				-                    else:
			
 
				-                        # print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
			
 
				-                        self.idle_count += 1
			
 
				-                        time.sleep(1)
			
 
				-                        continue
			
 
				-                
			
 
				-                # Process the object by triggering its StateMachine.tick() method
			
 
				-                self.on_tick_start(obj_to_process)
			
 
				-                try:
			
 
				-                    self.tick(obj_to_process)
			
 
				-                except Exception as err:
			
 
				-                    last_error = err
			
 
				-                    print(f'[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.tick()[/red] ERROR: [red]{type(err).__name__}: {err}[/red]')
			
 
				-                    db.connections.close_all()                         # always reset the db connection after an exception to clear any pending transactions
			
 
				-                    self.on_tick_exception(obj_to_process, err)
			
 
				-                    traceback.print_exc()
			
 
				-                finally:
			
 
				-                    self.on_tick_end(obj_to_process)
			
 
				-
			
 
				-        except BaseException as err:
			
 
				-            last_error = err
			
 
				-            if isinstance(err, KeyboardInterrupt):
			
 
				-                print()
			
 
				-            else:
			
 
				-                print(f'\n[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
			
 
				-                print(f'    Last processed object: {obj_to_process}')
			
 
				-                raise
			
 
				-        finally:
			
 
				-            self.on_shutdown(last_obj=obj_to_process, last_error=last_error)
			
 
				-    
			
 
				-    @classmethod
			
 
				-    def get_update_kwargs_to_claim_obj(cls) -> dict[str, Any]:
			
 
				-        """
			
 
				-        Get the field values needed to mark an pending obj_to_process as being actively processing (aka claimed)
			
 
				-        by the current Actor. returned kwargs will be applied using: qs.filter(id=obj_to_process.id).update(**kwargs).
			
 
				-        F() expressions are allowed in field values if you need to update a field based on its current value.
			
 
				-        Can be a defined as a normal method (instead of classmethod) on subclasses if it needs to access instance vars.
			
 
				-        """
			
 
				-        return {
			
 
				-            # cls.Model.state_field_name: cls._state_to_str(cls.ACTIVE_STATE),   # do this manually in the state machine enter hooks
			
 
				-            'retry_at': timezone.now() + timedelta(seconds=cls.MAX_TICK_TIME),
			
 
				-        }
			
 
				-    
			
 
				-    def get_next(self, atomic: bool | None=None) -> ModelType | None:
			
 
				-        """get the next object from the queue, atomically locking it if self.CLAIM_ATOMIC=True"""
			
 
				-        atomic = self.CLAIM_ATOMIC if atomic is None else atomic
			
 
				-        if atomic:
			
 
				-            # fetch and claim the next object from in the queue in one go atomically
			
 
				-            obj = self.get_next_atomic()
			
 
				-        else:
			
 
				-            # two-step claim: fetch the next object and lock it in a separate query
			
 
				-            obj = self.get_next_non_atomic()
			
 
				-        return obj
			
 
				-    
			
 
				-    def get_next_non_atomic(self) -> ModelType:
			
 
				-        """
			
 
				-        Naiively selects the top/first object from self.get_queue().order_by(*self.CLAIM_ORDER),
			
 
				-        then claims it by running .update(status='started', retry_at=<now + MAX_TICK_TIME>).
			
 
				-        
			
 
				-        Do not use this method if there is more than one Actor racing to get objects from the same queue,
			
 
				-        it will be slow/buggy as they'll compete to lock the same object at the same time (TOCTTOU race).
			
 
				-        """
			
 
				-        obj = self.get_queue().first()
			
 
				-        if obj is None:
			
 
				-            raise ActorQueueIsEmpty(f'No next object available in {self}.get_queue()')
			
 
				-        
			
 
				-        locked = self.get_queue().filter(id=obj.id).update(**self.get_update_kwargs_to_claim_obj())
			
 
				-        if not locked:
			
 
				-            raise ActorObjectAlreadyClaimed(f'Unable to lock the next {self.Model.__name__} object from {self}.get_queue().first()')
			
 
				-        return obj
			
 
				-        
			
 
				-    def get_next_atomic(self) -> ModelType | None:
			
 
				-        """
			
 
				-        Selects the top n=50 objects from the queue and atomically claims a random one from that set.
			
 
				-        This approach safely minimizes contention with other Actors trying to select from the same Queue.
			
 
				-
			
 
				-        The atomic query is roughly equivalent to the following:  (all done in one SQL query to avoid a TOCTTOU race)
			
 
				-            top_candidates are selected from:   qs.order_by(*CLAIM_ORDER).only('id')[:CLAIM_FROM_TOP_N]
			
 
				-            a single candidate is chosen using: qs.filter(id__in=top_n_candidates).order_by('?').first()
			
 
				-            the chosen obj is claimed using:    qs.filter(id=chosen_obj).update(status=ACTIVE_STATE, retry_at=<now + MAX_TICK_TIME>)
			
 
				-        """
			
 
				-        # TODO: if we switch from SQLite to PostgreSQL in the future, we should change this
			
 
				-        # to use SELECT FOR UPDATE instead of a subquery + ORDER BY RANDOM() LIMIT 1
			
 
				-        
			
 
				-        # e.g. SELECT id FROM core_archiveresult WHERE status NOT IN (...) AND retry_at <= '...' ORDER BY retry_at ASC LIMIT 50
			
 
				-        qs = self.get_queue()
			
 
				-        select_top_canidates_sql, select_params = self._sql_for_select_top_n_candidates(qs=qs)
			
 
				-        assert select_top_canidates_sql.startswith('SELECT ')
			
 
				-        
			
 
				-        # e.g. UPDATE core_archiveresult SET status='%s', retry_at='%s' WHERE status NOT IN (...) AND retry_at <= '...'
			
 
				-        update_claimed_obj_sql, update_params = self._sql_for_update_claimed_obj(qs=self.qs.all(), update_kwargs=self.get_update_kwargs_to_claim_obj())
			
 
				-        assert update_claimed_obj_sql.startswith('UPDATE ') and 'WHERE' not in update_claimed_obj_sql
			
 
				-        db_table = self.Model._meta.db_table  # e.g. core_archiveresult
			
 
				-        
			
 
				-        # subquery gets the pool of the top candidates e.g. self.get_queue().only('id')[:CLAIM_FROM_TOP_N]
			
 
				-        # main query selects a random one from that pool, and claims it using .update(status=ACTIVE_STATE, retry_at=<now + MAX_TICK_TIME>)
			
 
				-        # this is all done in one atomic SQL query to avoid TOCTTOU race conditions (as much as possible)
			
 
				-        atomic_select_and_update_sql = f"""
			
 
				-            with top_candidates AS ({select_top_canidates_sql})
			
 
				-            {update_claimed_obj_sql}
			
 
				-            WHERE "{db_table}"."id" IN (
			
 
				-                SELECT id FROM top_candidates
			
 
				-                ORDER BY RANDOM()
			
 
				-                LIMIT 1
			
 
				-            )
			
 
				-            RETURNING *;
			
 
				-        """
			
 
				-        
			
 
				-        # import ipdb; ipdb.set_trace()
			
 
				-
			
 
				-        try:
			
 
				-            updated = qs.raw(atomic_select_and_update_sql, (*select_params, *update_params))
			
 
				-            assert len(updated) <= 1, f'Expected to claim at most 1 object, but Django modified {len(updated)} objects!'
			
 
				-            return updated[0]
			
 
				-        except IndexError:
			
 
				-            if self.get_queue().exists():
			
 
				-                raise ActorObjectAlreadyClaimed(f'Unable to lock the next {self.Model.__name__} object from {self}.get_queue().first()')
			
 
				-            else:
			
 
				-                raise ActorQueueIsEmpty(f'No next object available in {self}.get_queue()')
			
 
				-
			
 
				-    def tick(self, obj_to_process: ModelType) -> None:
			
 
				-        """Call the object.sm.tick() method to process the object"""
			
 
				-        print(f'\n[grey53]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.tick()[/grey53] [blue]{obj_to_process.status.upper()}[/blue] ➡️ ...  +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
			
 
				-        
			
 
				-        # get the StateMachine instance from the object
			
 
				-        obj_statemachine = self._get_state_machine_instance(obj_to_process)
			
 
				-        starting_state = obj_statemachine.current_state
			
 
				-        
			
 
				-        # trigger the event on the StateMachine instance
			
 
				-        obj_tick_method = getattr(obj_statemachine, self.EVENT_NAME)  # e.g. obj_statemachine.tick()
			
 
				-        obj_tick_method()
			
 
				-        
			
 
				-        ending_state = obj_statemachine.current_state
			
 
				-        if starting_state != ending_state:
			
 
				-            self.on_state_change(obj_to_process, starting_state, ending_state)
			
 
				-        
			
 
				-        # save the object to persist any state changes
			
 
				-        obj_to_process.save()
			
 
				-        
			
 
				-    def on_startup(self) -> None:
			
 
				-        if self.mode == 'thread':
			
 
				-            # self.pid = get_native_id()  # thread id
			
 
				-            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (THREAD)[/green]')
			
 
				-            raise NotImplementedError('Thread-based actors are disabled to reduce codebase complexity. Please use processes for everything')
			
 
				-        else:
			
 
				-            self.pid = os.getpid()      # process id
			
 
				-            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (PROCESS)[/green]')
			
 
				-        # abx.pm.hook.on_actor_startup(actor=self)
			
 
				-        
			
 
				-    def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None:
			
 
				-        # if isinstance(last_error, KeyboardInterrupt) or last_error is None:
			
 
				-        #     last_error_str = '[green](CTRL-C)[/green]'
			
 
				-        # elif isinstance(last_error, ActorQueueIsEmpty):
			
 
				-        #     last_error_str = '[green](queue empty)[/green]'
			
 
				-        # elif isinstance(last_error, ActorObjectAlreadyClaimed):
			
 
				-        #     last_error_str = '[green](queue race)[/green]'
			
 
				-        # else:
			
 
				-        #     last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
			
 
				-
			
 
				-        # print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
			
 
				-        # abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error)
			
 
				-        pass
			
 
				-        
			
 
				-    def on_tick_start(self, obj_to_process: ModelType) -> None:
			
 
				-        # print(f'🏃‍♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
			
 
				-        # abx.pm.hook.on_actor_tick_start(actor=self, obj_to_process=obj)
			
 
				-        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
			
 
				-        pass
			
 
				-    
			
 
				-    def on_tick_end(self, obj_to_process: ModelType) -> None:
			
 
				-        # print(f'🏃‍♂️ {self}.on_tick_end() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
			
 
				-        # abx.pm.hook.on_actor_tick_end(actor=self, obj_to_process=obj_to_process)
			
 
				-        # self.timer.end()
			
 
				-        pass
			
 
				-        
			
 
				-        # import ipdb; ipdb.set_trace()
			
 
				-
			
 
				-    
			
 
				-    def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None:
			
 
				-        print(f'[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.on_tick_exception()[/red] [blue]{obj_to_process.status}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s: [red]{type(error).__name__}: {error}[/red]')
			
 
				-        # abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error)
			
 
				-
			
 
				-    def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None:
			
 
				-        print(f'[blue]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.on_state_change() {starting_state} ➡️ {ending_state}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
			
 
				-        # abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state)
			
 
				-
			
 
				-
			
 
				-def compile_sql_select(queryset: QuerySet, filter_kwargs: dict[str, Any] | None=None, order_args: tuple[str, ...]=(), limit: int | None=None) -> tuple[str, tuple[Any, ...]]:
			
 
				-    """
			
 
				-    Compute the SELECT query SQL for a queryset.filter(**filter_kwargs).order_by(*order_args)[:limit] call
			
 
				-    Returns a tuple of (sql, params) where sql is a template string containing %s (unquoted) placeholders for the params
			
 
				-    
			
 
				-    WARNING:
			
 
				-    final_sql = sql % params  DOES NOT WORK to assemble the final SQL string because the %s placeholders are not quoted/escaped
			
 
				-    they should always passed separately to the DB driver so it can do its own quoting/escaping to avoid SQL injection and syntax errors
			
 
				-    """
			
 
				-    assert isinstance(queryset, QuerySet), f'compile_sql_select(...) first argument must be a QuerySet, got: {type(queryset).__name__} instead'
			
 
				-    assert filter_kwargs is None or isinstance(filter_kwargs, dict), f'compile_sql_select(...) filter_kwargs argument must be a dict[str, Any], got: {type(filter_kwargs).__name__} instead'
			
 
				-    assert isinstance(order_args, tuple) and all(isinstance(arg, str) for arg in order_args), f'compile_sql_select(...) order_args argument must be a tuple[str, ...] got: {type(order_args).__name__} instead'
			
 
				-    assert limit is None or isinstance(limit, int), f'compile_sql_select(...) limit argument must be an int, got: {type(limit).__name__} instead'
			
 
				-    
			
 
				-    queryset = queryset._chain()                      # type: ignore   # copy queryset to avoid modifying the original
			
 
				-    if filter_kwargs:
			
 
				-        queryset = queryset.filter(**filter_kwargs)
			
 
				-    if order_args:
			
 
				-        queryset = queryset.order_by(*order_args)
			
 
				-    if limit is not None:
			
 
				-        queryset = queryset[:limit]
			
 
				-    query = queryset.query
			
 
				-    
			
 
				-    # e.g. SELECT id FROM core_archiveresult WHERE status NOT IN (%s, %s, %s) AND retry_at <= %s ORDER BY retry_at ASC LIMIT 50
			
 
				-    select_sql, select_params = query.get_compiler(queryset.db).as_sql()
			
 
				-    return select_sql, select_params
			
 
				-
			
 
				-
			
 
				-def compile_sql_update(queryset: QuerySet, update_kwargs: dict[str, Any]) -> tuple[str, tuple[Any, ...]]:
			
 
				-    """
			
 
				-    Compute the UPDATE query SQL for a queryset.filter(**filter_kwargs).update(**update_kwargs) call
			
 
				-    Returns a tuple of (sql, params) where sql is a template string containing %s (unquoted) placeholders for the params
			
 
				-    
			
 
				-    Based on the django.db.models.QuerySet.update() source code, but modified to return the SQL instead of executing the update
			
 
				-    https://github.com/django/django/blob/611bf6c2e2a1b4ab93273980c45150c099ab146d/django/db/models/query.py#L1217
			
 
				-    
			
 
				-    WARNING:
			
 
				-    final_sql = sql % params  DOES NOT WORK to assemble the final SQL string because the %s placeholders are not quoted/escaped
			
 
				-    they should always passed separately to the DB driver so it can do its own quoting/escaping to avoid SQL injection and syntax errors
			
 
				-    """
			
 
				-    assert isinstance(queryset, QuerySet), f'compile_sql_update(...) first argument must be a QuerySet, got: {type(queryset).__name__} instead'
			
 
				-    assert isinstance(update_kwargs, dict), f'compile_sql_update(...) update_kwargs argument must be a dict[str, Any], got: {type(update_kwargs).__name__} instead'
			
 
				-    
			
 
				-    queryset = queryset._chain().all()                # type: ignore   # copy queryset to avoid modifying the original and clear any filters
			
 
				-    queryset.query.clear_ordering(force=True)                          # clear any ORDER BY clauses
			
 
				-    queryset.query.clear_limits()                                      # clear any LIMIT clauses aka slices[:n]
			
 
				-    queryset._for_write = True                        # type: ignore
			
 
				-    query = queryset.query.chain(sql.UpdateQuery)     # type: ignore
			
 
				-    query.add_update_values(update_kwargs)            # type: ignore
			
 
				-    query.annotations = {}                                             # clear any annotations
			
 
				-    
			
 
				-    # e.g. UPDATE core_archiveresult SET status='%s', retry_at='%s' WHERE status NOT IN (%s, %s, %s) AND retry_at <= %s
			
 
				-    update_sql, update_params = query.get_compiler(queryset.db).as_sql()
			
 
				-    
			
 
				-    # make sure you only pass a raw queryset with no .filter(...) clauses applied to it, the return value is designed to used
			
 
				-    # in a manually assembled SQL query with its own WHERE clause later on
			
 
				-    assert 'WHERE' not in update_sql, f'compile_sql_update(...) should only contain a SET statement but it tried to return a query with a WHERE clause: {update_sql}'
			
 
				-    
			
 
				-    # print(update_sql, update_params)
			
 
				-
			
 
				-    return update_sql, update_params
			
 
				+#             process_updated = Process.objects.filter(id=proc.id, active_event=None).update(active_event=event)
			
 
				+#             if not process_updated:
			
 
				+#                 raise Exception(f'Unable to update process.active_event: {proc}.active_event = {event}')
			
 
				+
			
 
				+#     @staticmethod
			
 
				+#     def mark_event_succeeded(event: Event, output_events: Iterable[EventDict]):
			
 
				+#         assert event.claimed_proc and (event.claimed_proc == Process.current())
			
 
				+#         with transaction.atomic():
			
 
				+#             updated = Event.objects.filter(id=event.id, claimed_proc=event.claimed_proc, claimed_at=event.claimed_at, finished_at=None).update(finished_at=timezone.now())
			
 
				+#             if not updated:
			
 
				+#                 event.refresh_from_db()
			
 
				+#                 raise Exception(f'Event {event} failed to mark as succeeded, it was modified by another process: {event.claimed_proc}')
			
 
				+
			
 
				+#             process_updated = Process.objects.filter(id=event.claimed_proc.id, active_event=event).update(active_event=None)
			
 
				+#             if not process_updated:
			
 
				+#                 raise Exception(f'Unable to unset process.active_event: {event.claimed_proc}.active_event = {event}')
			
 
				+
			
 
				+#         # dispatch any output events
			
 
				+#         for output_event in output_events:
			
 
				+#             Event.dispatch(event=output_event, parent=event)
			
 
				+
			
 
				+#         # trigger any callback events
			
 
				+#         if event.on_success:
			
 
				+#             Event.dispatch(event=event.on_success, parent=event)
			
 
				+
			
 
				+#     @staticmethod
			
 
				+#     def mark_event_failed(event: Event, output_events: Iterable[EventDict]=(), error: BaseException | None = None):
			
 
				+#         assert event.claimed_proc and (event.claimed_proc == Process.current())
			
 
				+#         with transaction.atomic():
			
 
				+#             updated = event.objects.filter(id=event.id, claimed_proc=event.claimed_proc, claimed_at=event.claimed_at, finished_at=None).update(finished_at=timezone.now(), error=str(error))
			
 
				+#             if not updated:
			
 
				+#                 event.refresh_from_db()
			
 
				+#                 raise Exception(f'Event {event} failed to mark as failed, it was modified by another process: {event.claimed_proc}')
			
 
				+
			
 
				+#             process_updated = Process.objects.filter(id=event.claimed_proc.id, active_event=event).update(active_event=None)
			
 
				+#             if not process_updated:
			
 
				+#                 raise Exception(f'Unable to unset process.active_event: {event.claimed_proc}.active_event = {event}')
			
 
				+
			
 
				+        
			
 
				+#         # add dedicated error event to the output events
			
 
				+#         output_events = [
			
 
				+#             *output_events,
			
 
				+#             {'name': f'{event.name}_ERROR', 'error': f'{type(error).__name__}: {error}'},
			
 
				+#         ]
			
 
				+        
			
 
				+#         # dispatch any output events
			
 
				+#         for output_event in output_events:
			
 
				+#             Event.dispatch(event=output_event, parent=event)
			
 
				+        
			
 
				+#         # trigger any callback events
			
 
				+#         if event.on_failure:
			
 
				+#             Event.dispatch(event=event.on_failure, parent=event)