# 0024_assign_default_crawl.py
  1. # Generated by hand on 2025-12-29
  2. # Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
  3. from django.db import migrations, models
  4. import uuid
  5. def create_default_crawl_and_assign_snapshots(apps, schema_editor):
  6. """
  7. Create a default crawl for migrated snapshots and assign all snapshots without a crawl to it.
  8. Uses raw SQL because the app registry isn't fully populated during migrations.
  9. """
  10. from django.db import connection
  11. import uuid as uuid_lib
  12. from datetime import datetime
  13. cursor = connection.cursor()
  14. # Check if there are any snapshots without a crawl
  15. cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE crawl_id IS NULL")
  16. snapshots_without_crawl = cursor.fetchone()[0]
  17. if snapshots_without_crawl == 0:
  18. print('✓ Fresh install or all snapshots already have crawls')
  19. return
  20. # Get or create system user (pk=1)
  21. cursor.execute("SELECT id FROM auth_user WHERE id = 1")
  22. if not cursor.fetchone():
  23. cursor.execute("""
  24. INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined)
  25. VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?)
  26. """, [datetime.now().isoformat()])
  27. # Create a default crawl for migrated snapshots
  28. # At this point crawls_crawl is guaranteed to have v0.9.0 schema (crawls/0002 ran first)
  29. crawl_id = str(uuid_lib.uuid4())
  30. now = datetime.now().isoformat()
  31. cursor.execute("""
  32. INSERT INTO crawls_crawl (
  33. id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
  34. urls, max_depth, tags_str, label, notes, output_dir,
  35. status, retry_at, created_by_id, schedule_id, config, persona_id
  36. ) VALUES (?, ?, ?, 0, 0, '', 0, '', 'Migrated from v0.7.2/v0.8.6',
  37. 'Auto-created crawl for migrated snapshots', '',
  38. 'sealed', ?, 1, NULL, '{}', NULL)
  39. """, [crawl_id, now, now, now])
  40. # Assign all snapshots without a crawl to the default crawl
  41. cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id])
  42. print(f'✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}')
class Migration(migrations.Migration):
    """
    Backfill then tighten: assign every crawl-less snapshot to a default
    "migrated" crawl (RunPython step), then rebuild core_snapshot so that
    crawl_id is NOT NULL (RunSQL step).
    """

    dependencies = [
        # core must already be at the v0.9.0 schema so core_snapshot has crawl_id.
        ('core', '0023_upgrade_to_0_9_0'),
        # Guarantees crawls_crawl has the v0.9.0 columns the backfill INSERTs into.
        ('crawls', '0002_upgrade_from_0_8_6'),
        # auth_user must exist so the backfill can create the system user (pk=1).
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # Step 1: data backfill — after this, no snapshot has a NULL crawl_id.
        # Irreversible data creation, so the reverse is a no-op.
        migrations.RunPython(
            create_default_crawl_and_assign_snapshots,
            reverse_code=migrations.RunPython.noop,
        ),
        # Step 2: enforce NOT NULL on crawl_id.  The SQL targets SQLite, which
        # cannot add NOT NULL to an existing column in place, so the table is
        # rebuilt (create new -> copy -> drop old -> rename) and re-indexed.
        # SeparateDatabaseAndState keeps Django's model state in sync with the
        # raw SQL applied to the database.
        migrations.SeparateDatabaseAndState(
            database_operations=[
                # Now make crawl_id NOT NULL
                migrations.RunSQL(
                    sql="""
                    -- Rebuild snapshot table with NOT NULL crawl_id
                    CREATE TABLE core_snapshot_final (
                        id TEXT PRIMARY KEY NOT NULL,
                        created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                        modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                        url TEXT NOT NULL,
                        timestamp VARCHAR(32) NOT NULL UNIQUE,
                        bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                        crawl_id TEXT NOT NULL,
                        parent_snapshot_id TEXT,
                        title VARCHAR(512),
                        downloaded_at DATETIME,
                        depth INTEGER NOT NULL DEFAULT 0,
                        fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
                        config TEXT NOT NULL DEFAULT '{}',
                        notes TEXT NOT NULL DEFAULT '',
                        num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                        num_uses_failed INTEGER NOT NULL DEFAULT 0,
                        status VARCHAR(15) NOT NULL DEFAULT 'queued',
                        retry_at DATETIME,
                        current_step INTEGER NOT NULL DEFAULT 0,
                        FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
                        FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
                    );
                    INSERT INTO core_snapshot_final (
                        id, url, timestamp, title,
                        bookmarked_at, created_at, modified_at,
                        crawl_id, parent_snapshot_id,
                        downloaded_at, depth, fs_version,
                        config, notes,
                        num_uses_succeeded, num_uses_failed,
                        status, retry_at, current_step
                    )
                    SELECT
                        id, url, timestamp, title,
                        bookmarked_at, created_at, modified_at,
                        crawl_id, parent_snapshot_id,
                        downloaded_at, depth, fs_version,
                        COALESCE(config, '{}'), COALESCE(notes, ''),
                        num_uses_succeeded, num_uses_failed,
                        status, retry_at, current_step
                    FROM core_snapshot;
                    DROP TABLE core_snapshot;
                    ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
                    CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
                    CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
                    CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
                    CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
                    CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
                    CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
                    CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
                    CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
                    """,
                    reverse_sql=migrations.RunSQL.noop,
                ),
            ],
            state_operations=[
                # State-only: tell Django's migration state that Snapshot.crawl
                # now exists as a required FK.  No DB change happens here — the
                # table rebuild above already produced the matching schema.
                migrations.AddField(
                    model_name='snapshot',
                    name='crawl',
                    field=models.ForeignKey(
                        on_delete=models.deletion.CASCADE,
                        to='crawls.crawl',
                        help_text='Crawl that created this snapshot'
                    ),
                ),
            ],
        ),
    ]