models.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. __package__ = 'archivebox.machine'
  2. import socket
  3. from datetime import timedelta
  4. from pathlib import Path
  5. from django.db import models
  6. from django.utils import timezone
  7. from django.utils.functional import cached_property
  8. import abx
  9. import archivebox
  10. from pydantic_pkgr import Binary, BinProvider
  11. from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
  12. from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
  13. _CURRENT_MACHINE = None # global cache for the current machine
  14. _CURRENT_INTERFACE = None # global cache for the current network interface
  15. _CURRENT_BINARIES = {} # global cache for the currently installed binaries
  16. MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60 # 1 week (how often should we check for OS/hardware changes?)
  17. NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 # 1 hour (how often should we check for public IP/private IP/DNS changes?)
  18. INSTALLED_BINARY_RECHECK_INTERVAL = 1 * 30 * 60 # 30min (how often should we check for changes to locally installed binaries?)
  19. class MachineManager(models.Manager):
  20. def current(self) -> 'Machine':
  21. """Get the current machine that ArchiveBox is running on."""
  22. global _CURRENT_MACHINE
  23. if _CURRENT_MACHINE:
  24. expires_at = _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
  25. if timezone.now() < expires_at:
  26. # assume current machine cant change *while archivebox is actively running on it*
  27. # it's not strictly impossible to swap hardware while code is running,
  28. # but its rare and unusual so we check only once per week
  29. # (e.g. VMWare can live-migrate a VM to a new host while it's running)
  30. return _CURRENT_MACHINE
  31. else:
  32. _CURRENT_MACHINE = None
  33. _CURRENT_MACHINE, _created = self.update_or_create(
  34. guid=get_host_guid(),
  35. defaults={
  36. 'hostname': socket.gethostname(),
  37. **get_os_info(),
  38. **get_vm_info(),
  39. 'stats': get_host_stats(),
  40. },
  41. )
  42. _CURRENT_MACHINE.save() # populate ABID
  43. return _CURRENT_MACHINE
  44. class Machine(ABIDModel, ModelWithHealthStats):
  45. """Audit log entry for a physical machine that was used to do archiving."""
  46. abid_prefix = 'mxn_'
  47. abid_ts_src = 'self.created_at'
  48. abid_uri_src = 'self.guid'
  49. abid_subtype_src = '"01"'
  50. abid_rand_src = 'self.id'
  51. abid_drift_allowed = False
  52. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  53. abid = ABIDField(prefix=abid_prefix)
  54. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  55. modified_at = models.DateTimeField(auto_now=True)
  56. # IMMUTABLE PROPERTIES
  57. guid = models.CharField(max_length=64, default=None, null=False, unique=True, editable=False) # 64char sha256 hash of machine's unique hardware ID
  58. # MUTABLE PROPERTIES
  59. hostname = models.CharField(max_length=63, default=None, null=False) # e.g. somehost.subdomain.example.com
  60. hw_in_docker = models.BooleanField(default=False, null=False) # e.g. False
  61. hw_in_vm = models.BooleanField(default=False, null=False) # e.g. False
  62. hw_manufacturer = models.CharField(max_length=63, default=None, null=False) # e.g. Apple
  63. hw_product = models.CharField(max_length=63, default=None, null=False) # e.g. Mac Studio Mac13,1
  64. hw_uuid = models.CharField(max_length=255, default=None, null=False) # e.g. 39A12B50-...-...-...-...
  65. os_arch = models.CharField(max_length=15, default=None, null=False) # e.g. arm64
  66. os_family = models.CharField(max_length=15, default=None, null=False) # e.g. darwin
  67. os_platform = models.CharField(max_length=63, default=None, null=False) # e.g. macOS-14.6.1-arm64-arm-64bit
  68. os_release = models.CharField(max_length=63, default=None, null=False) # e.g. macOS 14.6.1
  69. os_kernel = models.CharField(max_length=255, default=None, null=False) # e.g. Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6000
  70. # STATS COUNTERS
  71. stats = models.JSONField(default=dict, null=False) # e.g. {"cpu_load": [1.25, 2.4, 1.4], "mem_swap_used_pct": 56, ...}
  72. # num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
  73. # num_uses_succeeded = models.PositiveIntegerField(default=0)
  74. objects: MachineManager = MachineManager()
  75. networkinterface_set: models.Manager['NetworkInterface']
  76. class NetworkInterfaceManager(models.Manager):
  77. def current(self) -> 'NetworkInterface':
  78. """Get the current network interface for the current machine."""
  79. global _CURRENT_INTERFACE
  80. if _CURRENT_INTERFACE:
  81. # assume the current network interface (public IP, DNS servers, etc.) wont change more than once per hour
  82. expires_at = _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
  83. if timezone.now() < expires_at:
  84. return _CURRENT_INTERFACE
  85. else:
  86. _CURRENT_INTERFACE = None
  87. machine = Machine.objects.current()
  88. net_info = get_host_network()
  89. _CURRENT_INTERFACE, _created = self.update_or_create(
  90. machine=machine,
  91. ip_public=net_info.pop('ip_public'),
  92. ip_local=net_info.pop('ip_local'),
  93. mac_address=net_info.pop('mac_address'),
  94. dns_server=net_info.pop('dns_server'),
  95. defaults=net_info,
  96. )
  97. _CURRENT_INTERFACE.save() # populate ABID
  98. return _CURRENT_INTERFACE
  99. class NetworkInterface(ABIDModel, ModelWithHealthStats):
  100. """Audit log entry for a physical network interface / internet connection that was used to do archiving."""
  101. abid_prefix = 'ixf_'
  102. abid_ts_src = 'self.machine.created_at'
  103. abid_uri_src = 'self.machine.guid'
  104. abid_subtype_src = 'self.iface'
  105. abid_rand_src = 'self.id'
  106. abid_drift_allowed = False
  107. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  108. abid = ABIDField(prefix=abid_prefix)
  109. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  110. modified_at = models.DateTimeField(auto_now=True)
  111. machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False) # e.g. Machine(id=...)
  112. # IMMUTABLE PROPERTIES
  113. mac_address = models.CharField(max_length=17, default=None, null=False, editable=False) # e.g. ab:cd:ef:12:34:56
  114. ip_public = models.GenericIPAddressField(default=None, null=False, editable=False) # e.g. 123.123.123.123 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334
  115. ip_local = models.GenericIPAddressField(default=None, null=False, editable=False) # e.g. 192.168.2.18 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334
  116. dns_server = models.GenericIPAddressField(default=None, null=False, editable=False) # e.g. 8.8.8.8 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334
  117. # MUTABLE PROPERTIES
  118. hostname = models.CharField(max_length=63, default=None, null=False) # e.g. somehost.sub.example.com
  119. iface = models.CharField(max_length=15, default=None, null=False) # e.g. en0
  120. isp = models.CharField(max_length=63, default=None, null=False) # e.g. AS-SONICTELECOM
  121. city = models.CharField(max_length=63, default=None, null=False) # e.g. Berkeley
  122. region = models.CharField(max_length=63, default=None, null=False) # e.g. California
  123. country = models.CharField(max_length=63, default=None, null=False) # e.g. United States
  124. # STATS COUNTERS (inherited from ModelWithHealthStats)
  125. # num_uses_failed = models.PositiveIntegerField(default=0)
  126. # num_uses_succeeded = models.PositiveIntegerField(default=0)
  127. objects: NetworkInterfaceManager = NetworkInterfaceManager()
  128. class Meta:
  129. unique_together = (
  130. # if *any* of these change, it's considered a different interface
  131. # because we might get different downloaded content as a result,
  132. # this forces us to store an audit trail whenever these things change
  133. ('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
  134. )
  135. class InstalledBinaryManager(models.Manager):
  136. def get_from_db_or_cache(self, binary: Binary) -> 'InstalledBinary':
  137. """Get or create an InstalledBinary record for a Binary on the local machine"""
  138. global _CURRENT_BINARIES
  139. cached_binary = _CURRENT_BINARIES.get(binary.name)
  140. if cached_binary:
  141. expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
  142. if timezone.now() < expires_at:
  143. is_loaded = binary.abspath and binary.version and binary.sha256
  144. if is_loaded:
  145. # if the caller took did the (expensive) job of loading the binary from the filesystem already
  146. # then their in-memory version is certainly more up-to-date than any potential cached version
  147. # use this opportunity to invalidate the cache in case if anything has changed
  148. is_different_from_cache = (
  149. binary.abspath != cached_binary.abspath
  150. or binary.version != cached_binary.version
  151. or binary.sha256 != cached_binary.sha256
  152. )
  153. if is_different_from_cache:
  154. _CURRENT_BINARIES.pop(binary.name)
  155. else:
  156. return cached_binary
  157. else:
  158. # if they have not yet loaded the binary
  159. # but our cache is recent enough and not expired, assume cached version is good enough
  160. # it will automatically reload when the cache expires
  161. # cached_binary will be stale/bad for up to 30min if binary was updated/removed on host system
  162. return cached_binary
  163. else:
  164. # cached binary is too old, reload it from scratch
  165. _CURRENT_BINARIES.pop(binary.name)
  166. if not binary.abspath or not binary.version or not binary.sha256:
  167. # if binary was not yet loaded from filesystem, do it now
  168. # this is expensive, we have to find it's abspath, version, and sha256, but it's necessary
  169. # to make sure we have a good, up-to-date record of it in the DB & in-memroy cache
  170. binary = archivebox.pm.hook.binary_load(binary=binary, fresh=True)
  171. assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
  172. _CURRENT_BINARIES[binary.name], _created = self.update_or_create(
  173. machine=Machine.objects.current(),
  174. name=binary.name,
  175. binprovider=binary.loaded_binprovider.name,
  176. version=str(binary.loaded_version),
  177. abspath=str(binary.loaded_abspath),
  178. sha256=str(binary.loaded_sha256),
  179. )
  180. cached_binary = _CURRENT_BINARIES[binary.name]
  181. cached_binary.save() # populate ABID
  182. # if we get this far make sure DB record matches in-memroy cache
  183. assert str(cached_binary.binprovider) == str(binary.loaded_binprovider.name)
  184. assert str(cached_binary.abspath) == str(binary.loaded_abspath)
  185. assert str(cached_binary.version) == str(binary.loaded_version)
  186. assert str(cached_binary.sha256) == str(binary.loaded_sha256)
  187. return cached_binary
  188. class InstalledBinary(ABIDModel, ModelWithHealthStats):
  189. abid_prefix = 'bin_'
  190. abid_ts_src = 'self.machine.created_at'
  191. abid_uri_src = 'self.machine.guid'
  192. abid_subtype_src = 'self.binprovider'
  193. abid_rand_src = 'self.id'
  194. abid_drift_allowed = False
  195. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  196. abid = ABIDField(prefix=abid_prefix)
  197. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  198. modified_at = models.DateTimeField(auto_now=True)
  199. # IMMUTABLE PROPERTIES
  200. machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True)
  201. name = models.CharField(max_length=63, default=None, null=False, blank=True)
  202. binprovider = models.CharField(max_length=31, default=None, null=False, blank=True)
  203. abspath = models.CharField(max_length=255, default=None, null=False, blank=True)
  204. version = models.CharField(max_length=32, default=None, null=False, blank=True)
  205. sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)
  206. # MUTABLE PROPERTIES (TODO)
  207. # is_pinned = models.BooleanField(default=False) # i.e. should this binary superceede other binaries with the same name on the host?
  208. # is_valid = models.BooleanField(default=True) # i.e. is this binary still available on the host?
  209. # STATS COUNTERS (inherited from ModelWithHealthStats)
  210. # num_uses_failed = models.PositiveIntegerField(default=0)
  211. # num_uses_succeeded = models.PositiveIntegerField(default=0)
  212. objects: InstalledBinaryManager = InstalledBinaryManager()
  213. class Meta:
  214. verbose_name = 'Installed Binary'
  215. verbose_name_plural = 'Installed Binaries'
  216. unique_together = (
  217. ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256'),
  218. )
  219. def __str__(self) -> str:
  220. return f'{self.name}@{self.binprovider}+{self.abspath}@{self.version}'
  221. def clean(self, *args, **kwargs) -> None:
  222. assert self.name or self.abspath
  223. self.name = str(self.name or self.abspath)
  224. assert self.name
  225. if not hasattr(self, 'machine'):
  226. self.machine = Machine.objects.current()
  227. if not self.binprovider:
  228. all_known_binproviders = list(abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values())
  229. binary = archivebox.pm.hook.binary_load(binary=Binary(name=self.name, binproviders=all_known_binproviders), fresh=True)
  230. self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None
  231. if not self.abspath:
  232. self.abspath = self.BINPROVIDER.get_abspath(self.name)
  233. if not self.version:
  234. self.version = self.BINPROVIDER.get_version(self.name, abspath=self.abspath)
  235. if not self.sha256:
  236. self.sha256 = self.BINPROVIDER.get_sha256(self.name, abspath=self.abspath)
  237. super().clean(*args, **kwargs)
  238. @cached_property
  239. def BINARY(self) -> Binary:
  240. for binary in abx.as_dict(archivebox.pm.hook.get_BINARIES()).values():
  241. if binary.name == self.name:
  242. return binary
  243. raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it')
  244. # TODO: we could technically reconstruct it from scratch, but why would we ever want to do that?
  245. @cached_property
  246. def BINPROVIDER(self) -> BinProvider:
  247. for binprovider in abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values():
  248. if binprovider.name == self.binprovider:
  249. return binprovider
  250. raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})')
  251. # maybe not a good idea to provide this? Binary in DB is a record of the binary's config
  252. # whereas a loaded binary is a not-yet saved instance that may not have the same config
  253. # why would we want to load a binary record from the db when it could be freshly loaded?
  254. def load_from_db(self) -> Binary:
  255. # TODO: implement defaults arg in pydantic_pkgr
  256. # return self.BINARY.load(defaults={
  257. # 'binprovider': self.BINPROVIDER,
  258. # 'abspath': Path(self.abspath),
  259. # 'version': self.version,
  260. # 'sha256': self.sha256,
  261. # })
  262. return Binary.model_validate({
  263. **self.BINARY.model_dump(),
  264. 'abspath': self.abspath and Path(self.abspath),
  265. 'version': self.version,
  266. 'sha256': self.sha256,
  267. 'loaded_binprovider': self.BINPROVIDER,
  268. 'binproviders_supported': self.BINARY.binproviders_supported,
  269. 'overrides': self.BINARY.overrides,
  270. })
  271. def load_fresh(self) -> Binary:
  272. return archivebox.pm.hook.binary_load(binary=self.BINARY, fresh=True)