0001_initial.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
# Generated by hand on 2025-12-29
# Creates Crawl and CrawlSchedule tables using raw SQL
  3. from django.db import migrations, models
  4. import django.db.models.deletion
  5. import django.utils.timezone
  6. import django.core.validators
  7. from django.conf import settings
  8. from archivebox.uuid_compat import uuid7
  9. from archivebox.base_models.models import get_or_create_system_user_pk
  10. class Migration(migrations.Migration):
  11. initial = True
  12. dependencies = [
  13. ('auth', '0012_alter_user_first_name_max_length'),
  14. migrations.swappable_dependency(settings.AUTH_USER_MODEL),
  15. ]
  16. operations = [
  17. migrations.SeparateDatabaseAndState(
  18. database_operations=[
  19. migrations.RunSQL(
  20. sql="""
  21. -- Create crawls_crawlschedule table first (circular FK will be added later)
  22. CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
  23. id TEXT PRIMARY KEY NOT NULL,
  24. created_at DATETIME NOT NULL,
  25. modified_at DATETIME NOT NULL,
  26. num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
  27. num_uses_failed INTEGER NOT NULL DEFAULT 0,
  28. schedule VARCHAR(64) NOT NULL,
  29. is_enabled BOOLEAN NOT NULL DEFAULT 1,
  30. label VARCHAR(64) NOT NULL DEFAULT '',
  31. notes TEXT NOT NULL DEFAULT '',
  32. template_id TEXT NOT NULL,
  33. created_by_id INTEGER NOT NULL,
  34. FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
  35. );
  36. CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_at_idx ON crawls_crawlschedule(created_at);
  37. CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_by_id_idx ON crawls_crawlschedule(created_by_id);
  38. CREATE INDEX IF NOT EXISTS crawls_crawlschedule_template_id_idx ON crawls_crawlschedule(template_id);
  39. -- Create crawls_crawl table
  40. CREATE TABLE IF NOT EXISTS crawls_crawl (
  41. id TEXT PRIMARY KEY NOT NULL,
  42. created_at DATETIME NOT NULL,
  43. modified_at DATETIME NOT NULL,
  44. num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
  45. num_uses_failed INTEGER NOT NULL DEFAULT 0,
  46. urls TEXT NOT NULL,
  47. config TEXT,
  48. max_depth INTEGER NOT NULL DEFAULT 0,
  49. tags_str VARCHAR(1024) NOT NULL DEFAULT '',
  50. persona_id TEXT,
  51. label VARCHAR(64) NOT NULL DEFAULT '',
  52. notes TEXT NOT NULL DEFAULT '',
  53. output_dir VARCHAR(512) NOT NULL DEFAULT '',
  54. status VARCHAR(15) NOT NULL DEFAULT 'queued',
  55. retry_at DATETIME,
  56. created_by_id INTEGER NOT NULL,
  57. schedule_id TEXT,
  58. FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
  59. FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
  60. );
  61. CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status);
  62. CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at);
  63. CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at);
  64. CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id);
  65. CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);
  66. """,
  67. reverse_sql="""
  68. DROP TABLE IF EXISTS crawls_crawl;
  69. DROP TABLE IF EXISTS crawls_crawlschedule;
  70. """
  71. ),
  72. ],
  73. state_operations=[
  74. migrations.CreateModel(
  75. name='CrawlSchedule',
  76. fields=[
  77. ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
  78. ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
  79. ('modified_at', models.DateTimeField(auto_now=True)),
  80. ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
  81. ('num_uses_failed', models.PositiveIntegerField(default=0)),
  82. ('schedule', models.CharField(max_length=64)),
  83. ('is_enabled', models.BooleanField(default=True)),
  84. ('label', models.CharField(blank=True, default='', max_length=64)),
  85. ('notes', models.TextField(blank=True, default='')),
  86. ('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
  87. ],
  88. options={
  89. 'verbose_name': 'Scheduled Crawl',
  90. 'verbose_name_plural': 'Scheduled Crawls',
  91. 'app_label': 'crawls',
  92. },
  93. ),
  94. migrations.CreateModel(
  95. name='Crawl',
  96. fields=[
  97. ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
  98. ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
  99. ('modified_at', models.DateTimeField(auto_now=True)),
  100. ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
  101. ('num_uses_failed', models.PositiveIntegerField(default=0)),
  102. ('urls', models.TextField(help_text='Newline-separated list of URLs to crawl')),
  103. ('config', models.JSONField(blank=True, default=dict, null=True)),
  104. ('max_depth', models.PositiveSmallIntegerField(default=0, validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)])),
  105. ('tags_str', models.CharField(blank=True, default='', max_length=1024)),
  106. ('persona_id', models.UUIDField(blank=True, null=True)),
  107. ('label', models.CharField(blank=True, default='', max_length=64)),
  108. ('notes', models.TextField(blank=True, default='')),
  109. ('output_dir', models.CharField(blank=True, default='', max_length=512)),
  110. ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15)),
  111. ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
  112. ('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
  113. ('schedule', models.ForeignKey(blank=True, editable=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='crawls.crawlschedule')),
  114. ],
  115. options={
  116. 'verbose_name': 'Crawl',
  117. 'verbose_name_plural': 'Crawls',
  118. 'app_label': 'crawls',
  119. },
  120. ),
  121. migrations.AddField(
  122. model_name='crawlschedule',
  123. name='template',
  124. field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
  125. ),
  126. ],
  127. ),
  128. ]