models.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. __package__ = 'archivebox.seeds'
  2. from django.db import models
  3. from django.conf import settings
  4. from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
  5. class Seed(ABIDModel, ModelWithHealthStats):
  6. """
  7. A fountain that produces URLs (+metadata) each time it's queried e.g.
  8. - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
  9. - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
  10. - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
  11. - https://getpocket.com/user/nikisweeting/feed
  12. - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  13. - ...
  14. Each query of a Seed can produce the same list of URLs, or a different list each time.
  15. The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
  16. When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
  17. The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
  18. The outlinks then get turned into new pending Snapshots under the same crawl,
  19. and the cycle repeats until Crawl.max_depth.
  20. Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
  21. stateful remote services, files with contents that change, directories that have new files within, etc.
  22. """
  23. abid_prefix = 'src_'
  24. abid_ts_src = 'self.created_at'
  25. abid_uri_src = 'self.uri'
  26. abid_subtype_src = 'self.extractor'
  27. abid_rand_src = 'self.id'
  28. abid_drift_allowed = True
  29. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  30. abid = ABIDField(prefix=abid_prefix)
  31. uri = models.URLField(max_length=2000, blank=False, null=False) # unique source location where URLs will be loaded from
  32. extractor = models.CharField(default='auto', max_length=32) # suggested extractor to use to load this URL source
  33. tags_str = models.CharField(max_length=255, null=False, blank=True, default='') # tags to attach to any URLs that come from this source
  34. config = models.JSONField(default=dict) # extra config to put in scope when loading URLs from this source
  35. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  36. modified_at = models.DateTimeField(auto_now=True)
  37. created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
  38. @property
  39. def source_type(self):
  40. # e.g. http/https://
  41. # file://
  42. # pocketapi://
  43. # s3://
  44. # etc..
  45. return self.uri.split('://', 1)[0].lower()
  46. class Meta:
  47. verbose_name = 'Seed'
  48. verbose_name_plural = 'Seeds'
  49. unique_together = (('created_by', 'uri', 'extractor'),)