
fix ABID and uniqueness for new Seed models

Nick Sweeting 1 year ago
parent
commit
2ebd28aebd
1 changed file with 41 additions and 13 deletions

archivebox/crawls/models.py  (+41 −13)

@@ -28,21 +28,32 @@ from ..extractors import EXTRACTOR_CHOICES
 
 class Seed(ABIDModel, ModelWithHealthStats):
     """
-    A fountain that produces URLs (+metadata) e.g.
-        - file://data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file://data/sources/2024-01-02_11-57-51__web_ui_add.txt
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
         - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
         - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
         - ...
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
         
-    When a crawl is created, a root_snapshot is initially created whos URI is the Seed URI.
-    The seed's preferred extractor is executed on the Snapshot, which produces an ArchiveResult.
-    The ArchiveResult (ideally) then contains some outlink URLs, which get turned into new Snapshots.
-    Then the cycle repeats up until Crawl.max_depth.
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
 
     Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files whos contents change, etc.
+    stateful remote services, files with contents that change, directories that have new files within, etc.
     """
+    
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+    
     uri = models.URLField(max_length=255, blank=False, null=False, unique=True)              # unique source location where URLs will be loaded from
     
     extractor = models.CharField(choices=EXTRACTOR_CHOICES, default='auto', max_length=32)   # suggested extractor to use to load this URL source
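
The abid_*_src class attributes added above declare which instance fields feed each component of the ABID. A minimal sketch of how such declarations could be resolved against an instance (an illustration only, not the actual ABIDModel implementation):

    # Hypothetical illustration only -- not the real ABIDModel logic.
    import hashlib

    def resolve_abid_src(instance, src: str) -> str:
        """Resolve a declaration like 'self.uri' or 'self.created_at' against a model instance."""
        value = instance
        for attr in src.removeprefix('self.').split('.'):
            value = getattr(value, attr)
        return str(value)

    def build_abid(instance) -> str:
        parts = [
            resolve_abid_src(instance, instance.abid_ts_src),       # e.g. 'self.created_at'
            resolve_abid_src(instance, instance.abid_uri_src),      # e.g. 'self.uri'
            resolve_abid_src(instance, instance.abid_subtype_src),  # e.g. 'self.extractor'
            resolve_abid_src(instance, instance.abid_rand_src),     # e.g. 'self.id'
        ]
        digest = hashlib.sha256('|'.join(parts).encode()).hexdigest()
        return instance.abid_prefix + digest[:20]                   # e.g. 'src_1a2b3c...'

Under this reading, pointing abid_uri_src at a stable value like self.uri here (or self.seed.uri for Crawl below) rather than an internal id is presumably what keeps regenerated ABIDs consistent, with abid_drift_allowed tolerating later changes to the underlying fields.
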
@@ -60,7 +71,7 @@ class Seed(ABIDModel, ModelWithHealthStats):
         #      pocketapi://
         #      s3://
         #      etc..
-        return self.uri.split(':')[0]
+        return self.uri.split('://')[0].lower()
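
For illustration, the updated property splits on the full '://' separator and lowercases the scheme (hypothetical, unsaved instances using the example URIs from the docstring above):

    Seed(uri='https://getpocket.com/user/nikisweeting/feed').source_type
    # -> 'https'
    Seed(uri='file:///data/sources/2024-01-02_11-57-51__cli_add.txt').source_type
    # -> 'file'
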
 
 
 class CrawlSchedule(ABIDModel, ModelWithHealthStats):
@@ -72,8 +83,8 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
     """
     abid_prefix = 'sch_'
     abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.crawl.abid'
-    abid_subtype_src = '"04"'
+    abid_uri_src = 'self.created_by_id'
+    abid_subtype_src = 'self.schedule'
     abid_rand_src = 'self.id'
     
     schedule = models.CharField(max_length=64, blank=False, null=False)
@@ -82,6 +93,13 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+    
+    crawl_set: models.Manager['Crawl']
+    
+    @property
+    def template(self):
+        """The base crawl that each new scheduled job should copy as a template"""
+        return self.crawl_set.first()
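
A sketch of how the scheduler could use this template when a CrawlSchedule comes due (illustrative only; the copied field names are assumptions, and the real logic lives in create_crawl_from_crawlschedule_if_due further down):

    def spawn_crawl_from_schedule(crawl_schedule):
        """Illustration: copy the template Crawl's settings into a fresh Crawl for this run."""
        template = crawl_schedule.template            # first Crawl attached to this schedule
        if template is None:
            return None
        return Crawl.objects.create(                  # assumed field names, for illustration
            seed=template.seed,
            persona_id=template.persona_id,
            schedule=crawl_schedule,
            created_by=crawl_schedule.created_by,
        )
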
 
 
 class Crawl(ABIDModel, ModelWithHealthStats):
@@ -94,7 +112,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
     """
     abid_prefix = 'crl_'
     abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.seed_id'
+    abid_uri_src = 'self.seed.uri'
     abid_subtype_src = 'self.persona_id'
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
@@ -125,6 +143,13 @@ class Crawl(ABIDModel, ModelWithHealthStats):
     class Meta(TypedModelMeta):
         verbose_name = 'Crawl'
         verbose_name_plural = 'Crawls'
+        
+    @property
+    def template(self):
+        """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off"""
+        if not self.schedule:
+            return None
+        return self.schedule.template
 
     @property
     def api_url(self) -> str:
@@ -138,6 +163,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
 
 
 class Outlink(models.Model):
+    """A record of a link found on a page, pointing to another page."""
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     
     src = models.URLField()   # parent page where the outlink/href was found       e.g. https://example.com/downloads
@@ -145,6 +171,8 @@ class Outlink(models.Model):
     
     via = models.ForeignKey(ArchiveResult, related_name='outlink_set')
 
+    class Meta:
+        unique_together = (('src', 'dst', 'via'),)
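
With this constraint the same (src, dst, via) outlink can only be recorded once per ArchiveResult; callers would typically use get_or_create so repeated discoveries of a link are deduplicated instead of raising IntegrityError (illustrative usage, not code from this commit; the dst URL and archive_result variable are hypothetical):

    from uuid import uuid4

    outlink, created = Outlink.objects.get_or_create(
        src='https://example.com/downloads',                 # parent page, example from the comment above
        dst='https://example.com/downloads/some_file.pdf',   # hypothetical outlink target
        via=archive_result,                                  # the ArchiveResult that found this link
        defaults={'id': uuid4()},                            # id has default=None, so supply one on create
    )
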
 
 
 def scheduler_runloop():
@@ -182,7 +210,7 @@ def create_crawl_from_ui_action(urls, extractor, credentials, depth, tags_str, p
 
 
 @abx.hookimpl.on_crawl_schedule_tick
-def create_crawl_from_crawl_schedule_if_due(crawl_schedule):
+def create_crawl_from_crawlschedule_if_due(crawl_schedule):
    # make sure it's not too early to run this scheduled import (makes this function idempotent / safe to call multiple times / every second)
     if timezone.now() < crawl_schedule.next_run_at:
         # it's not time to run it yet, wait for the next tick