sql.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. __package__ = 'archivebox.index'
  2. import re
  3. from io import StringIO
  4. from pathlib import Path
  5. from typing import List, Tuple, Iterator
  6. from django.db.models import QuerySet
  7. from django.db import transaction
  8. from archivebox.misc.util import enforce_types, parse_date
  9. from archivebox.config import DATA_DIR
  10. from archivebox.config.common import GENERAL_CONFIG
  11. from .schema import Link
  12. ### Main Links Index
  13. @enforce_types
  14. def parse_sql_main_index(out_dir: Path=DATA_DIR) -> Iterator[Link]:
  15. from core.models import Snapshot
  16. return (
  17. Link.from_json(page.as_json(*Snapshot.keys))
  18. for page in Snapshot.objects.all()
  19. )
  20. @enforce_types
  21. def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=DATA_DIR) -> None:
  22. if atomic:
  23. with transaction.atomic():
  24. return snapshots.delete()
  25. return snapshots.delete()
  26. @enforce_types
  27. def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
  28. from core.models import Snapshot, ArchiveResult
  29. from archivebox.base_models.models import get_or_create_system_user_pk
  30. info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
  31. info['created_by_id'] = created_by_id or get_or_create_system_user_pk()
  32. tag_list = list(dict.fromkeys(
  33. tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, link.tags or '')
  34. ))
  35. info.pop('tags')
  36. try:
  37. snapshot = Snapshot.objects.get(url=link.url)
  38. info["timestamp"] = snapshot.timestamp
  39. except Snapshot.DoesNotExist:
  40. while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
  41. info["timestamp"] = str(float(info["timestamp"]) + 1.0)
  42. snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
  43. snapshot.save_tags(tag_list)
  44. for extractor, entries in link.history.items():
  45. for entry in entries:
  46. if isinstance(entry, dict):
  47. result, _ = ArchiveResult.objects.get_or_create(
  48. snapshot_id=snapshot.pk,
  49. extractor=extractor,
  50. start_ts=parse_date(entry['start_ts']),
  51. defaults={
  52. 'end_ts': parse_date(entry['end_ts']),
  53. 'cmd': entry['cmd'],
  54. 'output': entry['output'],
  55. 'cmd_version': entry.get('cmd_version') or 'unknown',
  56. 'pwd': entry['pwd'],
  57. 'status': entry['status'],
  58. 'created_by_id': snapshot.created_by_id,
  59. }
  60. )
  61. else:
  62. result, _ = ArchiveResult.objects.update_or_create(
  63. snapshot_id=snapshot.pk,
  64. extractor=extractor,
  65. start_ts=parse_date(entry.start_ts),
  66. defaults={
  67. 'end_ts': parse_date(entry.end_ts),
  68. 'cmd': entry.cmd,
  69. 'output': entry.output,
  70. 'cmd_version': entry.cmd_version or 'unknown',
  71. 'pwd': entry.pwd,
  72. 'status': entry.status,
  73. 'created_by_id': snapshot.created_by_id,
  74. }
  75. )
  76. return snapshot
  77. @enforce_types
  78. def write_sql_main_index(links: List[Link], out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> None:
  79. for link in links:
  80. # with transaction.atomic():
  81. # write_link_to_sql_index(link)
  82. write_link_to_sql_index(link, created_by_id=created_by_id)
  83. @enforce_types
  84. def write_sql_link_details(link: Link, out_dir: Path=DATA_DIR, created_by_id: int | None=None) -> None:
  85. from core.models import Snapshot
  86. # with transaction.atomic():
  87. # try:
  88. # snap = Snapshot.objects.get(url=link.url)
  89. # except Snapshot.DoesNotExist:
  90. # snap = write_link_to_sql_index(link)
  91. # snap.title = link.title
  92. try:
  93. snap = Snapshot.objects.get(url=link.url)
  94. except Snapshot.DoesNotExist:
  95. snap = write_link_to_sql_index(link, created_by_id=created_by_id)
  96. snap.title = link.title
  97. tag_list = list(
  98. {tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, link.tags or '')}
  99. | set(snap.tags.values_list('name', flat=True))
  100. )
  101. snap.save()
  102. snap.save_tags(tag_list)
  103. @enforce_types
  104. def list_migrations(out_dir: Path=DATA_DIR) -> List[Tuple[bool, str]]:
  105. from django.core.management import call_command
  106. out = StringIO()
  107. call_command("showmigrations", list=True, stdout=out)
  108. out.seek(0)
  109. migrations = []
  110. for line in out.readlines():
  111. if line.strip() and ']' in line:
  112. status_str, name_str = line.strip().split(']', 1)
  113. is_applied = 'X' in status_str
  114. migration_name = name_str.strip()
  115. migrations.append((is_applied, migration_name))
  116. return migrations
  117. @enforce_types
  118. def apply_migrations(out_dir: Path=DATA_DIR) -> List[str]:
  119. from django.core.management import call_command
  120. out1, out2 = StringIO(), StringIO()
  121. call_command("migrate", interactive=False, database='default', stdout=out1)
  122. out1.seek(0)
  123. call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2)
  124. out2.seek(0)
  125. return [
  126. line.strip() for line in out1.readlines() + out2.readlines() if line.strip()
  127. ]
  128. @enforce_types
  129. def get_admins(out_dir: Path=DATA_DIR) -> List[str]:
  130. from django.contrib.auth.models import User
  131. return User.objects.filter(is_superuser=True).exclude(username='system')