models.py 39 KB


  1. """
  2. This file provides the Django ABIDField and ABIDModel base model to inherit from.
  3. """
  4. import io
  5. import csv
  6. import json
  7. from typing import Any, Dict, Union, List, Set, cast, ClassVar, Iterable
  8. import json
  9. from uuid import uuid4
  10. from functools import partial
  11. from pathlib import Path
  12. from charidfield import CharIDField # type: ignore[import-untyped]
  13. from django.contrib import admin
  14. from django.core import checks
  15. from django.core.exceptions import ValidationError, NON_FIELD_ERRORS
  16. from django.db import models
  17. from django.utils import timezone
  18. from django.utils.functional import classproperty
  19. from django.db.utils import OperationalError
  20. from django.contrib.auth import get_user_model
  21. from django.urls import reverse_lazy
  22. from django.conf import settings
  23. # from django.contrib.contenttypes.models import ContentType
  24. # from django.contrib.contenttypes.fields import GenericForeignKey
  25. # from django.contrib.contenttypes.fields import GenericRelation
  26. from django_stubs_ext.db.models import TypedModelMeta
  27. from tags.models import KVTag, ModelWithKVTags
  28. from archivebox import DATA_DIR
  29. from archivebox.index.json import to_json
  30. from archivebox.misc.hashing import get_dir_info
  31. from .abid import (
  32. ABID,
  33. ABID_LEN,
  34. ABID_RAND_LEN,
  35. ABID_SUFFIX_LEN,
  36. DEFAULT_ABID_PREFIX,
  37. DEFAULT_ABID_URI_SALT,
  38. abid_part_from_prefix,
  39. abid_hashes_from_values,
  40. ts_from_abid,
  41. abid_part_from_ts,
  42. )
  43. ####################################################
  44. DEFAULT_ICON = '<img src="" alt="Icon"/>'
  45. # Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ
  46. ABIDField = partial(
  47. CharIDField,
  48. max_length=ABID_LEN,
  49. help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
  50. default=None,
  51. null=True,
  52. blank=True,
  53. db_index=True,
  54. unique=True,
  55. )
  56. def get_or_create_system_user_pk(username='system'):
  57. """Get or create a system user with is_superuser=True to be the default owner for new DB rows"""
  58. User = get_user_model()
  59. # if only one user exists total, return that user
  60. if User.objects.filter(is_superuser=True).count() == 1:
  61. return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
  62. # otherwise, create a dedicated "system" user
  63. user, _was_created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})
  64. return user.pk
  65. class AutoDateTimeField(models.DateTimeField):
  66. # def pre_save(self, model_instance, add):
  67. # return timezone.now()
  68. pass
  69. class ABIDError(Exception):
  70. pass
  71. class ModelWithReadOnlyFields(models.Model):
  72. """
  73. Base class for models that have some read-only fields enforced by .save().
  74. """
  75. read_only_fields: ClassVar[tuple[str, ...]] = ()
  76. class Meta:
  77. abstract = True
  78. def _fresh_from_db(self):
  79. try:
  80. return self.objects.get(pk=self.pk)
  81. except self.__class__.DoesNotExist:
  82. return None
  83. def diff_from_db(self, keys: Iterable[str]=()) -> dict[str, tuple[Any, Any]]:
  84. """Get a dictionary of the fields that have changed from the values in the database"""
  85. keys = keys or [field.name for field in self._meta.get_fields()]
  86. if not keys:
  87. return {}
  88. in_db = self._fresh_from_db()
  89. if not in_db:
  90. return {}
  91. diff = {}
  92. for field in keys:
  93. new_value = getattr(self, field, None)
  94. existing_value = getattr(in_db, field, None)
  95. if new_value != existing_value:
  96. diff[field] = (existing_value, new_value)
  97. return diff
  98. def save(self, *args, **kwargs) -> None:
  99. diff = self.diff_from_db(keys=self.read_only_fields)
  100. if diff:
  101. changed_key = next(iter(diff.keys()))
  102. existing_value, new_value = diff[changed_key]
  103. raise AttributeError(f'{self}.{changed_key} is read-only and cannot be changed from {existing_value} -> {new_value}')
  104. super().save(*args, **kwargs)
  105. class ModelWithUUID(ModelWithReadOnlyFields, ModelWithKVTags):
  106. read_only_fields = ('id', 'created_at')
  107. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  108. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  109. class Meta(TypedModelMeta):
  110. abstract = True
  111. default_json_keys: ClassVar[tuple[str, ...]] = (
  112. 'TYPE',
  113. 'id',
  114. 'abid',
  115. 'str',
  116. 'modified_at',
  117. 'created_at',
  118. 'created_by_id',
  119. 'status',
  120. 'retry_at',
  121. 'notes',
  122. )
  123. @classmethod
  124. def from_dict(cls, fields: dict[str, Any]) -> Self:
  125. init_kwargs = {k: v for k, v in fields.items() if hasattr(cls, k)}
  126. return cls(**init_kwargs)
  127. def update(self, **kwargs) -> None:
  128. """Update the object's properties from a dict"""
  129. for key, value in kwargs.items():
  130. setattr(self, key, value)
  131. self.save()
  132. def as_json(self, keys: Iterable[str]=()) -> dict:
  133. """Get the object's properties as a dict"""
  134. return benedict({
  135. key: getattr(self, key)
  136. for key in (keys or self.default_json_keys)
  137. if hasattr(self, key)
  138. })
  139. @classproperty
  140. def TYPE(cls) -> str:
  141. """Get the full Python dotted-import path for this model, e.g. 'core.models.Snapshot'"""
  142. return f'{cls.__module__}.{cls.__name__}'
  143. @property
  144. def admin_change_url(self) -> str:
  145. """get the admin URL e.g. /admin/core/snapshot/abcd-1234-1234-asdfjkl23jsdf4/change/"""
  146. return f"/admin/{self._meta.app_label}/{self._meta.model_name}/{self.pk}/change/"
  147. class ModelWithSerializers(ModelWithUUID):
  148. def as_csv_row(self, keys: Iterable[str]=(), separator: str=',') -> str:
  149. """Get the object's properties as a csv string"""
  150. keys = keys or self.as_json().keys()
  151. # return separator.join(
  152. # str(getattr(self, key, ''))
  153. # for key in keys
  154. # )
  155. # use real csv lib instead:
  156. buffer = io.StringIO()
  157. csv_writer = csv.writer(buffer, delimiter=separator)
  158. csv_writer.writerow(
  159. str(getattr(self, key, ''))
  160. for key in keys
  161. )
  162. return buffer.getvalue()
  163. def as_jsonl_row(self, keys: Iterable[str]=(), **json_kwargs) -> str:
  164. """Get the object's properties as a jsonl string"""
  165. keys = keys or self.as_json().keys()
  166. return json.dumps({
  167. key: getattr(self, key, '')
  168. for key in keys
  169. }, **{'sort_keys': True, 'indent': None, **json_kwargs})
  170. def as_html_icon(self) -> str:
  171. """Get a representation of this object as a simple html <img> tag or emoji"""
  172. # render snapshot_detail.html template with self as context and return html string
  173. return DEFAULT_ICON
  174. def as_html_row(self) -> str:
  175. """Get a representation of this object as a static html table <tr>...</tr> string"""
  176. # render snapshot_detail.html template with self as context and return html string
  177. # TODO: replace with a real django template
  178. return f'<tr><td>{self.as_html_icon()}</td><td>{self.as_csv_row()}</td></tr>'
  179. def as_html_embed(self) -> str:
  180. """Get a representation of this object suitable for embedding inside a roughly 400x300px iframe"""
  181. # render snapshot_detail.html template with self as context and return html string
  182. # TODO: replace with a real django template
  183. return f'{self.as_html_row()}'
  184. def as_html_fullpage(self) -> str:
  185. """Get a static html page representation of this object"""
  186. # TODO: replace with a real django template
  187. return f'''
  188. <html>
  189. <head>
  190. <title>{self}</title>
  191. </head>
  192. <body>
  193. <header>
  194. <h1>{self}</h1>
  195. <pre>{self.as_jsonl_row()}</pre>
  196. </header>
  197. <hr/>
  198. <article>
  199. {self.as_html_embed()}
  200. </article>
  201. </body>
  202. </html>
  203. '''
  204. class ABIDModel(ModelWithReadOnlyFields, ModelWithUUID):
  205. """
  206. Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface and other helper methods.
  207. """
  208. abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_'
  209. abid_ts_src = 'self.created_at' # e.g. 'self.created_at'
  210. abid_uri_src = 'None' # e.g. 'self.uri' (MUST BE SET)
  211. abid_subtype_src = 'self.__class__.__name__' # e.g. 'self.extractor'
  212. abid_rand_src = 'self.id' # e.g. 'self.uuid' or 'self.id'
  213. abid_drift_allowed: bool = False # set to True to allow abid_field values to change after a fixed ABID has been issued (NOT RECOMMENDED: means values can drift out of sync from original ABID)
  214. abid_salt: str = DEFAULT_ABID_URI_SALT # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
  215. # **all abid_*_src fields listed above should be in read_only_fields!
  216. read_only_fields = ('id', 'abid', 'created_at', 'created_by')
  217. id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
  218. abid = ABIDField(prefix=abid_prefix)
  219. created_at = AutoDateTimeField(default=None, null=False, db_index=True)
  220. created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, db_index=True)
  221. modified_at = models.DateTimeField(auto_now=True)
  222. _prefetched_objects_cache: Dict[str, Any]
  223. class Meta(TypedModelMeta):
  224. abstract = True
  225. @admin.display(description='Summary')
  226. def __str__(self) -> str:
  227. return f'[{self.abid or (self.abid_prefix + "NEW")}] {self.__class__.__name__} {eval(self.abid_uri_src)}'
  228. def __init__(self, *args: Any, **kwargs: Any) -> None:
  229. """Overriden __init__ method ensures we have a stable creation timestamp that fields can use within initialization code pre-saving to DB."""
  230. super().__init__(*args, **kwargs) # type: ignore
  231. # pre-compute a stable timestamp of the obj init time (with abid.ts precision limit applied) for use when object is first created,
  232. # some other fields depend on a timestamp at creation time, and it's nice to have one common timestamp they can all share.
  233. # Used as an alternative to auto_now_add=True + auto_now=True which can produce two different times & requires saving to DB to get the TS.
  234. # (ordinarily fields cant depend on other fields until the obj is saved to db and recalled)
  235. self._init_timestamp = ts_from_abid(abid_part_from_ts(timezone.now()))
  236. @classmethod
  237. def check(cls, **kwargs):
  238. errors = super().check(**kwargs)
  239. try:
  240. assert hasattr(cls, 'id'), f'{cls.__name__}: All ABIDModel subclasses must define an id field'
  241. assert hasattr(cls, 'abid'), f'{cls.__name__}: All ABIDModel subclasses must define an abid field'
  242. assert hasattr(cls, 'created_at'), f'{cls.__name__}: All ABIDModel subclasses must define a created_at field'
  243. assert hasattr(cls, 'modified_at'), f'{cls.__name__}: All ABIDModel subclasses must define a modified_at field'
  244. assert hasattr(cls, 'created_by'), f'{cls.__name__}: All ABIDModel subclasses must define a created_by field'
  245. except AssertionError as e:
  246. errors.append(checks.Error(
  247. str(e),
  248. # hint='...',
  249. obj=cls,
  250. id=f"{cls.__module__}.{cls.__name__}.E001",
  251. ))
  252. return errors
  253. def clean(self, abid_drift_allowed: bool | None=None) -> None:
  254. # TODO: ideally issuing new ABIDs should be farmed out to a separate service that makes sure they're all unique and monotonic
  255. # but for now this works and is much faster, we just calculate ABID on first save, and warn if updating any fields would ever invalidate it
  256. if self._state.adding:
  257. # only runs once when a new object is first saved to the DB
  258. # sets self.id, self.pk, self.created_by, self.created_at, self.modified_at
  259. self._previous_abid = None
  260. self.abid = str(self.issue_new_abid())
  261. else:
  262. # otherwise if updating, make sure none of the field changes would invalidate existing ABID
  263. abid_diffs = self.ABID_FRESH_DIFFS
  264. if abid_diffs:
  265. # change has invalidated the existing ABID, raise a nice ValidationError pointing out which fields caused the issue
  266. keys_changed = ', '.join(diff['abid_src'] for diff in abid_diffs.values())
  267. full_summary = (
  268. f"This {self.__class__.__name__}(abid={str(self.ABID)}) was assigned a fixed, unique ID (ABID) based on its contents when it was created. " +
  269. f"\nYou must reduce your changes to not affect these fields [{keys_changed}], or create a new {self.__class__.__name__} object instead."
  270. )
  271. change_error = ValidationError({
  272. **{
  273. # url: ValidationError('Cannot update self.url= https://example.com/old -> https://example.com/new ...')
  274. diff['abid_src'].replace('self.', '')
  275. if (diff['old_val'] != diff['new_val']) and hasattr(self, diff['abid_src'].replace('self.', ''))
  276. else NON_FIELD_ERRORS
  277. : ValidationError(
  278. 'Cannot update %(abid_src)s= "%(old_val)s" -> "%(new_val)s" (would alter %(model)s.ABID.%(key)s=%(old_hash)s to %(new_hash)s)',
  279. code='ABIDConflict',
  280. params=diff,
  281. )
  282. for diff in abid_diffs.values()
  283. },
  284. NON_FIELD_ERRORS: ValidationError(full_summary),
  285. })
  286. allowed_to_invalidate_abid = self.abid_drift_allowed if (abid_drift_allowed is None) else abid_drift_allowed
  287. if allowed_to_invalidate_abid:
  288. # print(f'\n#### WARNING: Change allowed despite it invalidating the ABID of an existing record ({self.__class__.__name__}.abid_drift_allowed={self.abid_drift_allowed})!', self.abid)
  289. # print(change_error)
  290. # print('--------------------------------------------------------------------------------------------------')
  291. pass
  292. else:
  293. print(f'\n#### ERROR: Change blocked because it would invalidate ABID of an existing record ({self.__class__.__name__}.abid_drift_allowed={self.abid_drift_allowed})', self.abid)
  294. print(change_error)
  295. print('--------------------------------------------------------------------------------------------------')
  296. raise change_error
  297. def save(self, *args: Any, abid_drift_allowed: bool | None=None, **kwargs: Any) -> None:
  298. """Overriden save method ensures new ABID is generated while a new object is first saving."""
  299. self.clean(abid_drift_allowed=abid_drift_allowed)
  300. return super().save(*args, **kwargs)
  301. @classmethod
  302. def id_from_abid(cls, abid: str) -> str:
  303. return str(cls.objects.only('pk').get(abid=cls.abid_prefix + str(abid).split('_', 1)[-1]).pk)
  304. @property
  305. def ABID_SOURCES(self) -> Dict[str, str]:
  306. """"Get the dict of fresh ABID component values based on the live object's properties."""
  307. assert self.abid_prefix
  308. return {
  309. 'prefix': 'self.abid_prefix', # defined as static class vars at build time
  310. 'ts': self.abid_ts_src,
  311. 'uri': self.abid_uri_src,
  312. 'subtype': self.abid_subtype_src,
  313. 'rand': self.abid_rand_src,
  314. 'salt': 'self.abid_salt', # defined as static class vars at build time
  315. }
  316. @property
  317. def ABID_FRESH_VALUES(self) -> Dict[str, Any]:
  318. """"Get the dict of fresh ABID component values based on the live object's properties."""
  319. abid_sources = self.ABID_SOURCES
  320. assert all(src != 'None' for src in abid_sources.values())
  321. return {
  322. 'prefix': eval(abid_sources['prefix']),
  323. 'ts': eval(abid_sources['ts']),
  324. 'uri': eval(abid_sources['uri']),
  325. 'subtype': eval(abid_sources['subtype']),
  326. 'rand': eval(abid_sources['rand']),
  327. 'salt': eval(abid_sources['salt']),
  328. }
  329. @property
  330. def ABID_FRESH_HASHES(self) -> Dict[str, str]:
  331. """"Get the dict of fresh ABID component hashes based on the live object's properties."""
  332. abid_values = self.ABID_FRESH_VALUES
  333. assert all(val for val in abid_values.values())
  334. return abid_hashes_from_values(
  335. prefix=abid_values['prefix'],
  336. ts=abid_values['ts'],
  337. uri=abid_values['uri'],
  338. subtype=abid_values['subtype'],
  339. rand=abid_values['rand'],
  340. salt=abid_values['salt'],
  341. )
  342. @property
  343. def ABID_FRESH_DIFFS(self) -> Dict[str, Dict[str, Any]]:
  344. """Get the dict of discrepancies between the existing saved ABID and a new fresh ABID computed based on the live object."""
  345. existing_abid = self.ABID
  346. existing_values = {} if self._state.adding else self.__class__.objects.get(pk=self.pk).ABID_FRESH_VALUES
  347. abid_sources = self.ABID_SOURCES
  348. fresh_values = self.ABID_FRESH_VALUES
  349. fresh_hashes = self.ABID_FRESH_HASHES
  350. return {
  351. key: {
  352. 'key': key,
  353. 'model': self.__class__.__name__,
  354. 'pk': self.pk,
  355. 'abid_src': abid_sources[key],
  356. 'old_val': existing_values.get(key, None),
  357. 'old_hash': getattr(existing_abid, key),
  358. 'new_val': fresh_values[key],
  359. 'new_hash': new_hash,
  360. 'summary': f'{abid_sources[key]}= "{existing_values.get(key, None)}" -> "{fresh_values[key]}" (would alter {self.__class__.__name__.lower()}.ABID.{key}={getattr(existing_abid, key)} to {new_hash})',
  361. }
  362. for key, new_hash in fresh_hashes.items()
  363. if getattr(existing_abid, key) != new_hash
  364. }
  365. def issue_new_abid(self, overwrite=False) -> ABID:
  366. """
  367. Issue a new ABID based on the current object's properties, can only be called once on new objects (before they are saved to DB).
  368. TODO: eventually we should move this to a separate service that makes sure they're all unique and monotonic
  369. perhaps it could be moved to a KVTag as well, and we could just use the KVTag service + Events to issue new ABIDs
  370. """
  371. if not overwrite:
  372. assert self._state.adding, 'Can only issue new ABID when model._state.adding is True'
  373. assert eval(self.abid_uri_src), f'Can only issue new ABID if self.abid_uri_src is defined ({self.abid_uri_src}={eval(self.abid_uri_src)})'
  374. # Setup Field defaults to be ready for ABID generation
  375. self.abid = None
  376. self.id = self.id or uuid4()
  377. self.pk = self.id
  378. self.created_at = self.created_at or self._init_timestamp # cut off precision to match precision of TS component
  379. self.modified_at = self.modified_at or self.created_at
  380. self.created_by_id = getattr(self, 'created_by_id', None) or get_or_create_system_user_pk()
  381. # Compute fresh ABID values & hashes based on object's live properties
  382. abid_fresh_values = self.ABID_FRESH_VALUES
  383. assert all(abid_fresh_values.values()), f'All ABID_FRESH_VALUES must be set {abid_fresh_values}'
  384. abid_fresh_hashes = self.ABID_FRESH_HASHES
  385. assert all(abid_fresh_hashes.values()), f'All ABID_FRESH_HASHES must be able to be generated {abid_fresh_hashes}'
  386. new_abid = ABID(**abid_fresh_hashes)
  387. assert new_abid.ulid and new_abid.uuid and new_abid.typeid, f'Failed to calculate {abid_fresh_values["prefix"]}_ABID for {self.__class__.__name__}'
  388. return new_abid
  389. @property
  390. def ABID(self) -> ABID:
  391. """
  392. Get the object's existing ABID (from self.abid if it's already saved to DB, otherwise generated fresh)
  393. e.g. -> ABID(ts='01HX9FPYTR', uri='E4A5CCD9', subtype='00', rand='ZYEBQE')
  394. """
  395. if self.abid:
  396. return ABID.parse(cast(str, self.abid))
  397. return self.issue_new_abid()
  398. # These are all example helpers to make it easy to access alternate formats of the ABID.*, only add them if you actually need them
  399. # @property
  400. # def UUID(self) -> UUID:
  401. # """
  402. # Get a uuid.UUID (v4) representation of the object's ABID.
  403. # """
  404. # return self.ABID.uuid
  405. # @property
  406. # def uuid(self) -> str:
  407. # """
  408. # Get a str uuid.UUID (v4) representation of the object's ABID.
  409. # """
  410. # return str(self.ABID.uuid)
  411. # @property
  412. # def ULID(self) -> ULID:
  413. # """
  414. # Get a ulid.ULID representation of the object's ABID.
  415. # """
  416. # return self.ABID.ulid
  417. # @property
  418. # def TypeID(self) -> TypeID:
  419. # """
  420. # Get a typeid.TypeID (stripe-style) representation of the object's ABID.
  421. # """
  422. # return self.ABID.typeid
  423. @property
  424. def api_url(self) -> str:
  425. """
  426. Compute the REST API URL to access this object.
  427. e.g. /api/v1/core/snapshot/snp_01BJQMF54D093DXEAWZ6JYRP
  428. """
  429. return reverse_lazy('api-1:get_any', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}'
  430. @property
  431. def api_docs_url(self) -> str:
  432. """
  433. Compute the REST API Documentation URL to learn about accessing this object.
  434. e.g. /api/v1/docs#/Core%20Models/api_v1_core_get_snapshots
  435. """
  436. return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
  437. # class ModelWithStateMachine(models.Model):
  438. # ... see workers/models.py ...
  439. # retry_at = models.DateTimeField(default=None, null=True, db_index=True)
  440. # status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED)
  441. class ModelWithNotes(models.Model):
  442. """
  443. Very simple Model that adds a notes field to any model.
  444. """
  445. # label = models.CharField(max_length=63, blank=True, null=False, default='', help_text='A custom label for this object')
  446. notes = models.TextField(blank=True, null=False, default='', help_text='Any extra extra custom notes')
  447. class Meta:
  448. abstract = True
  449. class ModelWithHealthStats(models.Model):
  450. num_uses_failed = models.PositiveIntegerField(default=0)
  451. num_uses_succeeded = models.PositiveIntegerField(default=0)
  452. class Meta:
  453. abstract = True
  454. def increment_num_uses_failed(self) -> None:
  455. self.num_uses_failed += 1
  456. self.save()
  457. def increment_num_uses_succeeded(self) -> None:
  458. self.num_uses_succeeded += 1
  459. self.save()
  460. def reset_health_counts(self) -> None:
  461. # move all the failures to successes when resetting so we dont lose track of the total count
  462. self.num_uses_succeeded = self.num_uses_failed + self.num_uses_succeeded
  463. self.num_uses_failed = 0
  464. self.save()
  465. @property
  466. def health(self) -> int:
  467. total_uses = max((self.num_uses_failed + self.num_uses_succeeded, 1))
  468. success_pct = (self.num_uses_succeeded / total_uses) * 100
  469. return round(success_pct)
  470. class ModelWithConfig(models.Model):
  471. """
  472. Base Model that adds a config property to any ABIDModel.
  473. This config is retrieved by abx.pm.hook.get_scope_config(...) later whenever this model is used.
  474. """
  475. config = models.JSONField(default=dict, null=False, blank=False, editable=True)
  476. class Meta:
  477. abstract = True
  478. # @property
  479. # def unique_config(self) -> dict[str, Any]:
  480. # """Get the unique config that this model is adding to the default config"""
  481. # without_us = archivebox.pm.hook.get_scope_config()
  482. # with_us = archivebox.pm.hook.get_scope_config(extra_config=self.config)
  483. # return {
  484. # key: value
  485. # for key, value in with_us.items()
  486. # if key not in without_us
  487. # or without_us[key] != value
  488. # }
  489. class ModelWithOutputDir(ModelsWithSerializers, ModelWithUUID, ABIDModel):
  490. """
  491. Base Model that adds an output_dir property to any ABIDModel.
  492. It creates the directory on .save(with_indexes=True), automatically migrating any old data if needed.
  493. It then writes the indexes to the output_dir on .save(write_indexes=True).
  494. It also makes sure the output_dir is in sync with the model.
  495. """
  496. class Meta:
  497. abstract = True
  498. # output_dir = models.FilePathField(path=CONSTANTS.DATA_DIR, max_length=200, blank=True, null=True)
  499. # output_files = models.TextField(default='')
  500. # format: <sha256_hash>,<blake3_hash>,<size>,<content-type>,<path>
  501. # ...,...,123456,text/plain,index.merkle
  502. # ...,...,123456,text/html,index.html
  503. # ...,...,123456,application/json,index.json
  504. # ...,...,123456,text/html,singlefile/index.html
  505. def save(self, *args, write_indexes=False, **kwargs) -> None:
  506. super().save(*args, **kwargs)
  507. self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  508. self.save_json_index() # always write index.json to data/snapshots/snp_2342353k2jn3j32l4324/index.json
  509. if write_indexes:
  510. self.write_indexes() # write the index.html, merkle hashes, symlinks, send indexable texts to search backend, etc.
  511. @property
  512. def output_dir_parent(self) -> str:
  513. """Get the model type parent directory name that holds this object's data e.g. 'archiveresults'"""
  514. parent_dir = getattr(self, 'output_dir_parent', f'{self._meta.model_name}s')
  515. assert len(parent_dir) > 2, f'output_dir_parent must be a non-empty string, got: "{parent_dir}"'
  516. return parent_dir
  517. @property
  518. def output_dir_name(self) -> str:
  519. """Get the subdirectory name for the filesystem directory that holds this object's data e.g. 'snp_2342353k2jn3j32l4324'"""
  520. assert self.ABID
  521. return str(self.ABID) # e.g. snp_2342353k2jn3j32l4324
  522. @property
  523. def output_dir_str(self) -> str:
  524. """Get relateive the filesystem directory Path that holds that data for this object e.g. 'snapshots/snp_2342353k2jn3j32l4324'"""
  525. return f'{self.output_dir_parent}/{self.output_dir_name}' # e.g. snapshots/snp_2342353k2jn3j32l4324
  526. @property
  527. def OUTPUT_DIR(self) -> Path:
  528. """Get absolute filesystem directory Path that holds that data for this object e.g. Path('/data/snapshots/snp_2342353k2jn3j32l4324')"""
  529. return DATA_DIR / self.output_dir_str # e.g. /data/snapshots/snp_2342353k2jn3j32l4324
  530. def write_indexes(self):
  531. """Write the Snapshot json, html, and merkle indexes to its output dir"""
  532. print(f'{type(self).__name__}[{self.ABID}].write_indexes()')
  533. self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
  534. # self.migrate_output_dir()
  535. self.save_merkle_index()
  536. self.save_html_index()
  537. self.save_symlinks_index()
  538. # def migrate_output_dir(self):
  539. # """Move the output files to the new folder structure if needed"""
  540. # print(f'{type(self).__name__}[{self.ABID}].migrate_output_dir()')
  541. # self.migrate_from_0_7_2()
  542. # self.migrate_from_0_8_6()
  543. # # ... future migrations here
  544. # def migrate_from_0_7_2(self) -> None:
  545. # """Migrate output_dir generated by ArchiveBox <= 0.7.2 to current version"""
  546. # print(f'{type(self).__name__}[{self.ABID}].migrate_from_0_7_2()')
  547. # # move /data/archive/<timestamp> -> /data/archive/snapshots/<abid>
  548. # # update self.output_path = /data/archive/snapshots/<abid>
  549. # pass
  550. # def migrate_from_0_8_6(self) -> None:
  551. # """Migrate output_dir generated by ArchiveBox <= 0.8.6 to current version"""
  552. # # ... future migration code here ...
  553. # print(f'{type(self).__name__}[{self.ABID}].migrate_from_0_8_6()')
  554. # pass
  555. def save_merkle_index(self, **kwargs) -> None:
  556. """Write the ./.index.merkle file to the output dir"""
  557. # write self.generate_merkle_tree() to self.output_dir / '.index.merkle'
  558. print(f'{type(self).__name__}[{self.ABID}].save_merkle_index()')
  559. dir_info = get_dir_info(self.OUTPUT_DIR, max_depth=6)
  560. with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
  561. json.dump(dir_info, f)
  562. pass
  563. def save_html_index(self, **kwargs) -> None:
  564. # write self.as_html() to self.output_dir / 'index.html'
  565. print(f'{type(self).__name__}[{self.ABID}].save_html_index()')
  566. (self.OUTPUT_DIR / 'index.html').write_text(self.as_html())
  567. def save_json_index(self, **kwargs) -> None:
  568. """Save a JSON dump of the object to the output dir"""
  569. print(f'{type(self).__name__}[{self.ABID}].save_json_index()')
  570. # write self.as_json() to self.output_dir / 'index.json'
  571. (self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))
  572. def save_symlinks_index(self) -> None:
  573. """Set up the symlink farm pointing to this object's data"""
  574. print(f'{type(self).__name__}[{self.ABID}].save_symlinks_index()')
  575. # ln -s ../../../../self.output_dir data/index/snapshots_by_date/2024-01-01/example.com/<abid>
  576. # ln -s ../../../../self.output_dir data/index/snapshots_by_domain/example.com/2024-01-01/<abid>
  577. # ln -s self.output_dir data/archive/1453452234234.21445
  578. pass
  579. def as_json(self, *keys) -> dict:
  580. """Get the object's properties as a dict"""
  581. return {
  582. 'TYPE': self.TYPE,
  583. 'id': str(self.id),
  584. 'abid': str(self.ABID),
  585. 'str': str(self),
  586. 'created_by_id': self.created_by_id,
  587. 'created_at': self.created_at,
  588. 'modified_at': self.modified_at,
  589. 'status': getattr(self, 'status', None),
  590. 'retry_at': getattr(self, 'retry_at', None),
  591. 'notes': getattr(self, 'notes', None),
  592. **{key: getattr(self, key) for key in keys},
  593. }
  594. def as_html(self) -> str:
  595. """Get the object's properties as a html string"""
  596. # render snapshot_detail.html template with self as context and return html string
  597. return str(self)
  598. ####################################################
  599. # Django helpers
  600. def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
  601. """
  602. Return the mapping of all ABID prefixes to their models.
  603. e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}
  604. """
  605. import django.apps
  606. prefix_map = {}
  607. for model in django.apps.apps.get_models():
  608. abid_prefix = getattr(model, 'abid_prefix', None)
  609. if abid_prefix:
  610. prefix_map[abid_prefix] = model
  611. return prefix_map
  612. def find_prefix_for_abid(abid: ABID) -> str:
  613. """
  614. Find the correct prefix for a given ABID that may have be missing a prefix (slow).
  615. e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'
  616. """
  617. # if existing abid prefix is correct, lookup is easy
  618. model = find_model_from_abid(abid)
  619. if model:
  620. assert issubclass(model, ABIDModel)
  621. return model.abid_prefix
  622. # prefix might be obj_ or missing, fuzzy-search to find any object that matches
  623. return find_obj_from_abid_rand(abid)[0].abid_prefix
  624. def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
  625. """
  626. Return the Django Model that corresponds to a given ABID prefix.
  627. e.g. 'tag_' -> core.models.Tag
  628. """
  629. prefix = abid_part_from_prefix(prefix) # snp_... -> snp_
  630. import django.apps
  631. for model in django.apps.apps.get_models():
  632. if not issubclass(model, ABIDModel): continue # skip non-ABID-enabled models
  633. if not hasattr(model, 'objects'): continue # skip abstract models
  634. if (model.abid_prefix == prefix):
  635. return model
  636. return None
  637. def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
  638. """
  639. Shortcut for find_model_from_abid_prefix(abid.prefix)
  640. """
  641. return find_model_from_abid_prefix(abid.prefix)
def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
    """
    This is a huge hack and should only be used for debugging, never use this in real code / expose this to users.

    Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow).
    e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')

    While the guard `raise` on the first line is in place, calling this function
    always raises Exception and nothing below it runs.

    Args:
        rand: an ABID, or a string that is left-padded with '0' up to
            ABID_SUFFIX_LEN characters and then parsed with ABID.parse().
        model: optional model class to include in the set of models searched.

    Returns:
        A one-element list as soon as an exact match is found, otherwise the
        list of all partial matches collected across the models searched
        (possibly empty).
    """
    # deliberate safety guard: makes everything below unreachable until it is commented out
    raise Exception('THIS FUNCTION IS FOR DEBUGGING ONLY, comment this line out temporarily when you need to use it, but dont commit it!')

    # convert str to ABID if necessary
    if isinstance(rand, ABID):
        abid: ABID = rand
    else:
        rand = str(rand)
        if len(rand) < ABID_SUFFIX_LEN:
            # too short to be a full suffix: left-pad with zeros before parsing
            padding_needed = ABID_SUFFIX_LEN - len(rand)
            rand = ('0'*padding_needed) + rand
        abid = ABID.parse(rand)

    import django.apps

    partial_matches: List[ABIDModel] = []

    # candidate models: the caller-supplied model, the model matching the ABID's prefix,
    # and every registered model; falsy entries (None) are dropped and the set de-duplicates
    models_to_try = cast(Set[type[models.Model]], set(filter(bool, (
        model,
        find_model_from_abid(abid),
        *django.apps.apps.get_models(),
    ))))
    # print(abid, abid.rand, abid.uuid, models_to_try)

    # NOTE: the loop variable rebinds the `model` parameter; the set has no defined iteration order
    for model in models_to_try:
        if not issubclass(model, ABIDModel): continue   # skip Models that arent ABID-enabled
        if not hasattr(model, 'objects'): continue      # skip abstract Models
        assert hasattr(model, 'objects')                # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684

        # continue on to try fuzzy searching by randomness portion derived from uuid field
        try:
            qs = []
            if hasattr(model, 'abid'):
                qs = model.objects.filter(abid__endswith=abid.rand)
            elif hasattr(model, 'uuid'):
                qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
            elif hasattr(model, 'id'):
                # NOTE: this only works on SQLite where every column is a string
                # other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field
                # try to search for uuid=...-2354352
                # try to search for id=...2354352
                # try to search for id=2354352
                qs = model.objects.filter(
                    models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
                    | models.Q(id__endswith=abid.rand)
                    | models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand)
                )

            for obj in qs:
                # NOTE(review): `abid` is an ABID here while every candidate is a str -- confirm that
                # ABID compares equal to its string form, otherwise this shortcut never fires.
                # NOTE(review): obj.ABID and obj.abid are read for every obj, including ones reached
                # through the uuid/id branches -- presumably all ABIDModel subclasses provide both; verify.
                if abid in (str(obj.ABID), str(obj.id), str(obj.pk), str(obj.abid)):
                    # found exact match, no need to keep iterating
                    return [obj]
                partial_matches.append(obj)
        except OperationalError as err:
            # a failing query for one model is reported and the search moves on to the next model
            print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n')

    return partial_matches
  696. def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
  697. """
  698. Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast).
  699. e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
  700. """
  701. model = model or find_model_from_abid(abid)
  702. assert model, f'Could not find model that could match this ABID type: {abid}'
  703. try:
  704. if hasattr(model, 'abid'):
  705. return model.objects.get(abid__endswith=abid.suffix)
  706. if hasattr(model, 'uuid'):
  707. return model.objects.get(uuid=abid.uuid)
  708. return model.objects.get(id=abid.uuid)
  709. except model.DoesNotExist:
  710. # if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case
  711. if hasattr(model, 'abid') or (not fuzzy):
  712. raise
  713. # continue on to try fuzzy searching by randomness portion derived from uuid field
  714. match_by_rand = find_obj_from_abid_rand(abid, model=model)
  715. if match_by_rand:
  716. if match_by_rand[0].abid_prefix != abid.prefix:
  717. print(f'[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesnt match!', abid, '\n')
  718. return match_by_rand
  719. raise model.DoesNotExist