| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191 |
- #!/usr/bin/env python3
- """
- Helper functions and schema definitions for migration tests.
- This module provides:
- - Schema definitions for each major ArchiveBox version (0.4.x, 0.7.x, 0.8.x)
- - Data seeding functions to populate test databases
- - Helper functions to run archivebox commands and verify results
- """
- import os
- import sys
- import json
- import sqlite3
- import subprocess
- from pathlib import Path
- from datetime import datetime, timezone
- from typing import Dict, List, Tuple
- from uuid import uuid4
# =============================================================================
# Schema Definitions for Each Version
# =============================================================================

# 0.4.x-era schema: the smallest historical layout.  Snapshot tags live in a
# denormalized comma-separated `tags` VARCHAR column (there is no Tag table
# yet), and there is no ArchiveResult table at all.  Only the bare
# django_migrations bookkeeping table is present.
SCHEMA_0_4 = """
-- Django system tables (minimal)
CREATE TABLE IF NOT EXISTS django_migrations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app VARCHAR(255) NOT NULL,
name VARCHAR(255) NOT NULL,
applied DATETIME NOT NULL
);
-- Core tables for 0.4.x
CREATE TABLE IF NOT EXISTS core_snapshot (
id CHAR(32) PRIMARY KEY,
url VARCHAR(2000) NOT NULL UNIQUE,
timestamp VARCHAR(32) NOT NULL UNIQUE,
title VARCHAR(128),
tags VARCHAR(256),
added DATETIME NOT NULL,
updated DATETIME
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
"""
# 0.7.x-era schema: full Django auth/admin/session tables, a normalized
# core_tag table with a core_snapshot_tags many-to-many join table (replacing
# 0.4.x's denormalized tags column), and a new core_archiveresult table with
# an integer AUTOINCREMENT primary key.  Snapshot ids are still CHAR(32)
# (dashless UUID hex).  The trailing INSERT pre-populates the content types
# Django expects to find.
SCHEMA_0_7 = """
-- Django system tables (complete for 0.7.x)
CREATE TABLE IF NOT EXISTS django_migrations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app VARCHAR(255) NOT NULL,
name VARCHAR(255) NOT NULL,
applied DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS django_content_type (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app_label VARCHAR(100) NOT NULL,
model VARCHAR(100) NOT NULL,
UNIQUE(app_label, model)
);
CREATE TABLE IF NOT EXISTS auth_permission (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(255) NOT NULL,
content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
codename VARCHAR(100) NOT NULL,
UNIQUE(content_type_id, codename)
);
CREATE TABLE IF NOT EXISTS auth_group (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(150) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS auth_group_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES auth_group(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(group_id, permission_id)
);
CREATE TABLE IF NOT EXISTS auth_user (
id INTEGER PRIMARY KEY AUTOINCREMENT,
password VARCHAR(128) NOT NULL,
last_login DATETIME,
is_superuser BOOL NOT NULL,
username VARCHAR(150) NOT NULL UNIQUE,
first_name VARCHAR(150) NOT NULL,
last_name VARCHAR(150) NOT NULL,
email VARCHAR(254) NOT NULL,
is_staff BOOL NOT NULL,
is_active BOOL NOT NULL,
date_joined DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS auth_user_groups (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
group_id INTEGER NOT NULL REFERENCES auth_group(id),
UNIQUE(user_id, group_id)
);
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(user_id, permission_id)
);
CREATE TABLE IF NOT EXISTS django_admin_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
action_time DATETIME NOT NULL,
object_id TEXT,
object_repr VARCHAR(200) NOT NULL,
action_flag SMALLINT UNSIGNED NOT NULL,
change_message TEXT NOT NULL,
content_type_id INTEGER REFERENCES django_content_type(id),
user_id INTEGER NOT NULL REFERENCES auth_user(id)
);
CREATE TABLE IF NOT EXISTS django_session (
session_key VARCHAR(40) NOT NULL PRIMARY KEY,
session_data TEXT NOT NULL,
expire_date DATETIME NOT NULL
);
-- Core tables for 0.7.x
CREATE TABLE IF NOT EXISTS core_tag (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS core_snapshot (
id CHAR(32) PRIMARY KEY,
url VARCHAR(2000) NOT NULL UNIQUE,
timestamp VARCHAR(32) NOT NULL UNIQUE,
title VARCHAR(512),
added DATETIME NOT NULL,
updated DATETIME
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
-- Many-to-many for snapshot tags
CREATE TABLE IF NOT EXISTS core_snapshot_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
tag_id INTEGER NOT NULL REFERENCES core_tag(id),
UNIQUE(snapshot_id, tag_id)
);
CREATE TABLE IF NOT EXISTS core_archiveresult (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
extractor VARCHAR(32) NOT NULL,
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
output VARCHAR(1024),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(16) NOT NULL
);
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
-- Insert required content types
INSERT INTO django_content_type (app_label, model) VALUES
('contenttypes', 'contenttype'),
('auth', 'permission'),
('auth', 'group'),
('auth', 'user'),
('admin', 'logentry'),
('sessions', 'session'),
('core', 'snapshot'),
('core', 'archiveresult'),
('core', 'tag');
"""
# 0.8.x-era schema: adds the machine/*, api/* and crawls/* app tables, moves
# Snapshot and Crawl ids to CHAR(36) UUIDs, and adds status/retry columns.
#
# Fix: tables are created in dependency order.  crawls_crawl is defined
# *before* crawls_crawlschedule, because the schedule's template_id column
# references crawls_crawl(id).  SQLite resolves foreign-key targets lazily,
# so the old order happened to work, but a dependency-correct creation order
# is robust under PRAGMA foreign_keys=ON and easier to reason about.
SCHEMA_0_8 = """
-- Django system tables (complete for 0.8.x)
CREATE TABLE IF NOT EXISTS django_migrations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app VARCHAR(255) NOT NULL,
name VARCHAR(255) NOT NULL,
applied DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS django_content_type (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app_label VARCHAR(100) NOT NULL,
model VARCHAR(100) NOT NULL,
UNIQUE(app_label, model)
);
CREATE TABLE IF NOT EXISTS auth_permission (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(255) NOT NULL,
content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
codename VARCHAR(100) NOT NULL,
UNIQUE(content_type_id, codename)
);
CREATE TABLE IF NOT EXISTS auth_group (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(150) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS auth_group_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES auth_group(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(group_id, permission_id)
);
CREATE TABLE IF NOT EXISTS auth_user (
id INTEGER PRIMARY KEY AUTOINCREMENT,
password VARCHAR(128) NOT NULL,
last_login DATETIME,
is_superuser BOOL NOT NULL,
username VARCHAR(150) NOT NULL UNIQUE,
first_name VARCHAR(150) NOT NULL,
last_name VARCHAR(150) NOT NULL,
email VARCHAR(254) NOT NULL,
is_staff BOOL NOT NULL,
is_active BOOL NOT NULL,
date_joined DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS auth_user_groups (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
group_id INTEGER NOT NULL REFERENCES auth_group(id),
UNIQUE(user_id, group_id)
);
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(user_id, permission_id)
);
CREATE TABLE IF NOT EXISTS django_admin_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
action_time DATETIME NOT NULL,
object_id TEXT,
object_repr VARCHAR(200) NOT NULL,
action_flag SMALLINT UNSIGNED NOT NULL,
change_message TEXT NOT NULL,
content_type_id INTEGER REFERENCES django_content_type(id),
user_id INTEGER NOT NULL REFERENCES auth_user(id)
);
CREATE TABLE IF NOT EXISTS django_session (
session_key VARCHAR(40) NOT NULL PRIMARY KEY,
session_data TEXT NOT NULL,
expire_date DATETIME NOT NULL
);
-- Machine app tables (added in 0.8.x)
CREATE TABLE IF NOT EXISTS machine_machine (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
guid VARCHAR(64) NOT NULL UNIQUE,
hostname VARCHAR(63),
hw_in_docker BOOLEAN NOT NULL DEFAULT 0,
hw_in_vm BOOLEAN NOT NULL DEFAULT 0,
hw_manufacturer VARCHAR(63),
hw_product VARCHAR(63),
hw_uuid VARCHAR(255),
os_arch VARCHAR(15),
os_family VARCHAR(15),
os_platform VARCHAR(63),
os_release VARCHAR(63),
os_kernel VARCHAR(255),
stats TEXT DEFAULT '{}',
config TEXT DEFAULT '{}',
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS machine_networkinterface (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
machine_id CHAR(36) NOT NULL REFERENCES machine_machine(id),
mac_address VARCHAR(17),
ip_public VARCHAR(45),
ip_local VARCHAR(45),
dns_server VARCHAR(45),
hostname VARCHAR(63),
iface VARCHAR(15),
isp VARCHAR(63),
city VARCHAR(63),
region VARCHAR(63),
country VARCHAR(63),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS machine_dependency (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
bin_name VARCHAR(63) NOT NULL UNIQUE,
bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
custom_cmds TEXT DEFAULT '{}',
config TEXT DEFAULT '{}'
);
CREATE TABLE IF NOT EXISTS machine_binary (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
machine_id CHAR(36) REFERENCES machine_machine(id),
dependency_id CHAR(36) REFERENCES machine_dependency(id),
name VARCHAR(63),
binprovider VARCHAR(31),
abspath VARCHAR(255),
version VARCHAR(32),
sha256 VARCHAR(64),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- API app tables (added in 0.8.x)
CREATE TABLE IF NOT EXISTS api_apitoken (
id CHAR(36) PRIMARY KEY,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
token VARCHAR(32) NOT NULL UNIQUE,
expires DATETIME
);
CREATE TABLE IF NOT EXISTS api_outboundwebhook (
id CHAR(36) PRIMARY KEY,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
name VARCHAR(255) NOT NULL DEFAULT '',
signal VARCHAR(255) NOT NULL,
ref VARCHAR(255) NOT NULL,
endpoint VARCHAR(2083) NOT NULL,
headers TEXT DEFAULT '{}',
auth_token VARCHAR(4000) NOT NULL DEFAULT '',
enabled BOOLEAN NOT NULL DEFAULT 1,
keep_last_response BOOLEAN NOT NULL DEFAULT 0,
last_response TEXT NOT NULL DEFAULT '',
last_success DATETIME,
last_failure DATETIME,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Tag table (AutoField PK in 0.8.x)
CREATE TABLE IF NOT EXISTS core_tag (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
created_at DATETIME,
modified_at DATETIME,
created_by_id INTEGER REFERENCES auth_user(id)
);
-- Crawls tables (new in 0.8.x); crawls_crawl first so crawlschedule's
-- template_id FK target already exists
CREATE TABLE IF NOT EXISTS crawls_crawl (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
modified_at DATETIME,
urls TEXT NOT NULL,
config TEXT DEFAULT '{}',
max_depth SMALLINT UNSIGNED NOT NULL DEFAULT 0,
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
persona_id CHAR(36),
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
schedule_id CHAR(36),
output_dir VARCHAR(256) NOT NULL DEFAULT '',
status VARCHAR(16) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
modified_at DATETIME,
schedule VARCHAR(64) NOT NULL,
is_enabled BOOLEAN NOT NULL DEFAULT 1,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
template_id CHAR(36) REFERENCES crawls_crawl(id),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Snapshot table (0.8.x with UUID PK, status, crawl FK)
CREATE TABLE IF NOT EXISTS core_snapshot (
id CHAR(36) PRIMARY KEY,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
url VARCHAR(2000) NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL,
crawl_id CHAR(36) REFERENCES crawls_crawl(id),
title VARCHAR(512),
downloaded_at DATETIME,
depth SMALLINT UNSIGNED NOT NULL DEFAULT 0,
retry_at DATETIME,
status VARCHAR(16) NOT NULL DEFAULT 'queued',
config TEXT DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(256),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_created_at ON core_snapshot(created_at);
-- Many-to-many for snapshot tags
CREATE TABLE IF NOT EXISTS core_snapshot_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
tag_id INTEGER NOT NULL REFERENCES core_tag(id),
UNIQUE(snapshot_id, tag_id)
);
-- Core ArchiveResult table (0.8.x with AutoField PK + UUID, status)
CREATE TABLE IF NOT EXISTS core_archiveresult (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid CHAR(36) UNIQUE,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
extractor VARCHAR(32) NOT NULL,
pwd VARCHAR(256),
cmd TEXT,
cmd_version VARCHAR(128),
output VARCHAR(1024),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(16) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(256),
iface_id INTEGER,
config TEXT DEFAULT '{}',
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
-- Insert required content types
INSERT INTO django_content_type (app_label, model) VALUES
('contenttypes', 'contenttype'),
('auth', 'permission'),
('auth', 'group'),
('auth', 'user'),
('admin', 'logentry'),
('sessions', 'session'),
('core', 'snapshot'),
('core', 'archiveresult'),
('core', 'tag'),
('machine', 'machine'),
('machine', 'networkinterface'),
('machine', 'dependency'),
('machine', 'binary'),
('crawls', 'crawl'),
('crawls', 'crawlschedule'),
('crawls', 'seed'),
('api', 'apitoken'),
('api', 'outboundwebhook');
"""
# =============================================================================
# Test Data Generators
# =============================================================================
def generate_uuid() -> str:
    """Return a random UUID4 rendered as 32 lowercase hex chars, no dashes.

    Matches how ArchiveBox stores CHAR(32) ids in SQLite.
    """
    return str(uuid4()).replace('-', '')
def generate_timestamp() -> str:
    """Build an ArchiveBox-style timestamp string for the current UTC time.

    Format: 14-digit ``YYYYmmddHHMMSS`` followed by a fixed ``.000000``
    microseconds part.
    """
    now_utc = datetime.now(timezone.utc)
    return '{:%Y%m%d%H%M%S}.000000'.format(now_utc)
def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
    """Seed a 0.4.x database with realistic test data.

    Inserts 5 snapshots whose tags live in the denormalized comma-separated
    ``tags`` column (the 0.4.x schema has no Tag table or ArchiveResult
    table), then records the single core migration as applied.

    Args:
        db_path: Path to an SQLite file that already has SCHEMA_0_4 applied.

    Returns:
        Dict with 'snapshots' (one dict per inserted row) and 'tags_str'
        (the raw tags string of each snapshot) for later verification.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()
        created_data: Dict[str, List[Dict]] = {
            'snapshots': [],
            'tags_str': [],
        }
        test_urls = [
            ('https://example.com/page1', 'Example Page 1', 'news,tech'),
            ('https://example.org/article', 'Article Title', 'blog,reading'),
            ('https://github.com/user/repo', 'GitHub Repository', 'code,github'),
            ('https://news.ycombinator.com/item?id=12345', 'HN Discussion', 'news,discussion'),
            ('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', 'reference,wiki'),
        ]
        for i, (url, title, tags) in enumerate(test_urls):
            snapshot_id = generate_uuid()
            # Deterministic unique timestamps: one snapshot per day in Jan 2024.
            timestamp = f'2024010{i+1}120000.000000'
            added = f'2024-01-0{i+1} 12:00:00'
            cursor.execute("""
                INSERT INTO core_snapshot (id, url, timestamp, title, tags, added, updated)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, (snapshot_id, url, timestamp, title, tags, added, added))
            created_data['snapshots'].append({
                'id': snapshot_id,
                'url': url,
                'timestamp': timestamp,
                'title': title,
                'tags': tags,
            })
            created_data['tags_str'].append(tags)
        cursor.execute("""
            INSERT INTO django_migrations (app, name, applied)
            VALUES ('core', '0001_initial', datetime('now'))
        """)
        conn.commit()
    finally:
        # Close even if an INSERT raises, so the DB handle isn't leaked and
        # test temp dirs can be removed (matters on Windows file locking).
        conn.close()
    return created_data
def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
    """Seed a 0.7.x database with realistic test data.

    Creates one superuser, 5 tags, 5 snapshots (each linked to 2 tags via
    the core_snapshot_tags M2M table), 5 archive results per snapshot, and
    records every Django/core migration up to core.0022 as applied.

    Args:
        db_path: Path to an SQLite file that already has SCHEMA_0_7 applied.

    Returns:
        Dict with 'users', 'snapshots', 'tags', and 'archiveresults' lists
        describing every inserted row, for later verification.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()
        created_data: Dict[str, List[Dict]] = {
            'users': [],
            'snapshots': [],
            'tags': [],
            'archiveresults': [],
        }
        # Create a superuser that owns the archive.
        cursor.execute("""
            INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
                                   email, is_staff, is_active, date_joined)
            VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
                    '[email protected]', 1, 1, datetime('now'))
        """)
        user_id = cursor.lastrowid
        created_data['users'].append({'id': user_id, 'username': 'admin'})
        # Create 5 tags.
        tag_names = ['news', 'tech', 'blog', 'reference', 'code']
        for name in tag_names:
            cursor.execute("""
                INSERT INTO core_tag (name, slug) VALUES (?, ?)
            """, (name, name.lower()))
            created_data['tags'].append({
                'id': cursor.lastrowid,
                'name': name,
                'slug': name.lower(),
            })
        # Create 5 snapshots.
        test_urls = [
            ('https://example.com/page1', 'Example Page 1'),
            ('https://example.org/article', 'Article Title'),
            ('https://github.com/user/repo', 'GitHub Repository'),
            ('https://news.ycombinator.com/item?id=12345', 'HN Discussion'),
            ('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test'),
        ]
        for i, (url, title) in enumerate(test_urls):
            snapshot_id = generate_uuid()
            # Deterministic unique timestamps: one snapshot per day in Jan 2024.
            timestamp = f'2024010{i+1}120000.000000'
            added = f'2024-01-0{i+1} 12:00:00'
            cursor.execute("""
                INSERT INTO core_snapshot (id, url, timestamp, title, added, updated)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (snapshot_id, url, timestamp, title, added, added))
            created_data['snapshots'].append({
                'id': snapshot_id,
                'url': url,
                'timestamp': timestamp,
                'title': title,
            })
            # Assign 2 (adjacent, wrapping) tags to each snapshot.
            tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
            for tag_id in tag_ids:
                cursor.execute("""
                    INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
                """, (snapshot_id, tag_id))
            # Create 5 archive results for each snapshot, with a mix of statuses.
            extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
            statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
            for j, (extractor, status) in enumerate(zip(extractors, statuses)):
                cursor.execute("""
                    INSERT INTO core_archiveresult
                    (snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    snapshot_id, extractor,
                    json.dumps([extractor, '--version']),
                    f'/data/archive/{timestamp}',
                    '1.0.0',
                    # Only successful extractors produce an output path.
                    f'{extractor}/index.html' if status == 'succeeded' else '',
                    f'2024-01-0{i+1} 12:00:0{j}',
                    f'2024-01-0{i+1} 12:00:1{j}',
                    status
                ))
                created_data['archiveresults'].append({
                    'snapshot_id': snapshot_id,
                    'extractor': extractor,
                    'status': status,
                })
        # Record migrations as applied (0.7.x migrations up to core.0022).
        migrations = [
            ('contenttypes', '0001_initial'),
            ('contenttypes', '0002_remove_content_type_name'),
            ('auth', '0001_initial'),
            ('auth', '0002_alter_permission_name_max_length'),
            ('auth', '0003_alter_user_email_max_length'),
            ('auth', '0004_alter_user_username_opts'),
            ('auth', '0005_alter_user_last_login_null'),
            ('auth', '0006_require_contenttypes_0002'),
            ('auth', '0007_alter_validators_add_error_messages'),
            ('auth', '0008_alter_user_username_max_length'),
            ('auth', '0009_alter_user_last_name_max_length'),
            ('auth', '0010_alter_group_name_max_length'),
            ('auth', '0011_update_proxy_permissions'),
            ('auth', '0012_alter_user_first_name_max_length'),
            ('admin', '0001_initial'),
            ('admin', '0002_logentry_remove_auto_add'),
            ('admin', '0003_logentry_add_action_flag_choices'),
            ('sessions', '0001_initial'),
            ('core', '0001_initial'),
            ('core', '0002_auto_20200625_1521'),
            ('core', '0003_auto_20200630_1034'),
            ('core', '0004_auto_20200713_1552'),
            ('core', '0005_auto_20200728_0326'),
            ('core', '0006_auto_20201012_1520'),
            ('core', '0007_archiveresult'),
            ('core', '0008_auto_20210105_1421'),
            ('core', '0009_auto_20210216_1038'),
            ('core', '0010_auto_20210216_1055'),
            ('core', '0011_auto_20210216_1331'),
            ('core', '0012_auto_20210216_1425'),
            ('core', '0013_auto_20210218_0729'),
            ('core', '0014_auto_20210218_0729'),
            ('core', '0015_auto_20210218_0730'),
            ('core', '0016_auto_20210218_1204'),
            ('core', '0017_auto_20210219_0211'),
            ('core', '0018_auto_20210327_0952'),
            ('core', '0019_auto_20210401_0654'),
            ('core', '0020_auto_20210410_1031'),
            ('core', '0021_auto_20220914_0934'),
            ('core', '0022_auto_20231023_2008'),
        ]
        cursor.executemany("""
            INSERT INTO django_migrations (app, name, applied)
            VALUES (?, ?, datetime('now'))
        """, migrations)
        conn.commit()
    finally:
        # Close even on failure so the DB handle isn't leaked and test temp
        # dirs can be removed (matters on Windows file locking).
        conn.close()
    return created_data
- def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
- """Seed a 0.8.x database with realistic test data including Crawls."""
- conn = sqlite3.connect(str(db_path))
- cursor = conn.cursor()
- created_data = {
- 'users': [],
- 'crawls': [],
- 'snapshots': [],
- 'tags': [],
- 'archiveresults': [],
- }
- # Create a user
- cursor.execute("""
- INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
- email, is_staff, is_active, date_joined)
- VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
- '[email protected]', 1, 1, datetime('now'))
- """)
- user_id = cursor.lastrowid
- created_data['users'].append({'id': user_id, 'username': 'admin'})
- # Create 5 tags
- tag_names = ['news', 'tech', 'blog', 'reference', 'code']
- for name in tag_names:
- cursor.execute("""
- INSERT INTO core_tag (name, slug, created_at, modified_at, created_by_id)
- VALUES (?, ?, datetime('now'), datetime('now'), ?)
- """, (name, name.lower(), user_id))
- tag_id = cursor.lastrowid
- created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
- # Create 2 Crawls (0.9.0 schema - no seeds)
- test_crawls = [
- ('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
- ('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
- ]
- for i, (urls, max_depth, label) in enumerate(test_crawls):
- crawl_id = generate_uuid()
- cursor.execute("""
- INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
- config, max_depth, tags_str, label, status, retry_at,
- num_uses_failed, num_uses_succeeded)
- VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
- """, (crawl_id, user_id, urls, max_depth, label))
- created_data['crawls'].append({
- 'id': crawl_id,
- 'urls': urls,
- 'max_depth': max_depth,
- 'label': label,
- })
- # Create 5 snapshots linked to crawls
- test_urls = [
- ('https://example.com/page1', 'Example Page 1', created_data['crawls'][0]['id']),
- ('https://example.org/article', 'Article Title', created_data['crawls'][0]['id']),
- ('https://github.com/user/repo', 'GitHub Repository', created_data['crawls'][1]['id']),
- ('https://news.ycombinator.com/item?id=12345', 'HN Discussion', None),
- ('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', None),
- ]
- for i, (url, title, crawl_id) in enumerate(test_urls):
- snapshot_id = generate_uuid()
- timestamp = f'2024010{i+1}120000.000000'
- created_at = f'2024-01-0{i+1} 12:00:00'
- cursor.execute("""
- INSERT INTO core_snapshot (id, created_by_id, created_at, modified_at, url, timestamp,
- bookmarked_at, crawl_id, title, depth, status, config, notes)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 'queued', '{}', '')
- """, (snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title))
- created_data['snapshots'].append({
- 'id': snapshot_id,
- 'url': url,
- 'timestamp': timestamp,
- 'title': title,
- 'crawl_id': crawl_id,
- })
- # Assign 2 tags to each snapshot
- tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
- for tag_id in tag_ids:
- cursor.execute("""
- INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
- """, (snapshot_id, tag_id))
- # Create 5 archive results for each snapshot
- extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
- statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
- for j, (extractor, status) in enumerate(zip(extractors, statuses)):
- result_uuid = generate_uuid()
- cursor.execute("""
- INSERT INTO core_archiveresult
- (uuid, created_by_id, created_at, modified_at, snapshot_id, extractor, pwd,
- cmd, cmd_version, output, start_ts, end_ts, status, retry_at, notes, output_dir)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), '', ?)
- """, (
- result_uuid, user_id, f'2024-01-0{i+1} 12:00:0{j}', f'2024-01-0{i+1} 12:00:1{j}',
- snapshot_id, extractor,
- f'/data/archive/{timestamp}',
- json.dumps([extractor, '--version']),
- '1.0.0',
- f'{extractor}/index.html' if status == 'succeeded' else '',
- f'2024-01-0{i+1} 12:00:0{j}',
- f'2024-01-0{i+1} 12:00:1{j}',
- status,
- f'{extractor}',
- ))
- created_data['archiveresults'].append({
- 'uuid': result_uuid,
- 'snapshot_id': snapshot_id,
- 'extractor': extractor,
- 'status': status,
- })
- # Record migrations as applied (0.8.x migrations)
- migrations = [
- ('contenttypes', '0001_initial'),
- ('contenttypes', '0002_remove_content_type_name'),
- ('auth', '0001_initial'),
- ('auth', '0002_alter_permission_name_max_length'),
- ('auth', '0003_alter_user_email_max_length'),
- ('auth', '0004_alter_user_username_opts'),
- ('auth', '0005_alter_user_last_login_null'),
- ('auth', '0006_require_contenttypes_0002'),
- ('auth', '0007_alter_validators_add_error_messages'),
- ('auth', '0008_alter_user_username_max_length'),
- ('auth', '0009_alter_user_last_name_max_length'),
- ('auth', '0010_alter_group_name_max_length'),
- ('auth', '0011_update_proxy_permissions'),
- ('auth', '0012_alter_user_first_name_max_length'),
- ('admin', '0001_initial'),
- ('admin', '0002_logentry_remove_auto_add'),
- ('admin', '0003_logentry_add_action_flag_choices'),
- ('sessions', '0001_initial'),
- ('core', '0001_initial'),
- ('core', '0002_auto_20200625_1521'),
- ('core', '0003_auto_20200630_1034'),
- ('core', '0004_auto_20200713_1552'),
- ('core', '0005_auto_20200728_0326'),
- ('core', '0006_auto_20201012_1520'),
- ('core', '0007_archiveresult'),
- ('core', '0008_auto_20210105_1421'),
- ('core', '0009_auto_20210216_1038'),
- ('core', '0010_auto_20210216_1055'),
- ('core', '0011_auto_20210216_1331'),
- ('core', '0012_auto_20210216_1425'),
- ('core', '0013_auto_20210218_0729'),
- ('core', '0014_auto_20210218_0729'),
- ('core', '0015_auto_20210218_0730'),
- ('core', '0016_auto_20210218_1204'),
- ('core', '0017_auto_20210219_0211'),
- ('core', '0018_auto_20210327_0952'),
- ('core', '0019_auto_20210401_0654'),
- ('core', '0020_auto_20210410_1031'),
- ('core', '0021_auto_20220914_0934'),
- ('core', '0022_auto_20231023_2008'),
- # For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
- ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
- ('core', '0024_auto_20240513_1143'),
- ('core', '0025_alter_archiveresult_uuid'),
- ('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
- ('core', '0027_update_snapshot_ids'),
- ('core', '0028_alter_archiveresult_uuid'),
- ('core', '0029_alter_archiveresult_id'),
- ('core', '0030_alter_archiveresult_uuid'),
- ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
- ('core', '0032_alter_archiveresult_id'),
- ('core', '0033_rename_id_archiveresult_old_id'),
- ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
- ('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
- ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
- ('core', '0037_rename_id_snapshot_old_id'),
- ('core', '0038_rename_uuid_snapshot_id'),
- ('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
- ('core', '0040_archiveresult_snapshot'),
- ('core', '0041_alter_archiveresult_snapshot_and_more'),
- ('core', '0042_remove_archiveresult_snapshot_old'),
- ('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
- ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
- ('core', '0045_alter_snapshot_old_id'),
- ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
- ('core', '0047_alter_snapshottag_unique_together_and_more'),
- ('core', '0048_alter_archiveresult_snapshot_and_more'),
- ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
- ('core', '0050_alter_snapshottag_snapshot_old'),
- ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
- ('core', '0052_alter_snapshottag_unique_together_and_more'),
- ('core', '0053_remove_snapshottag_snapshot_old'),
- ('core', '0054_alter_snapshot_timestamp'),
- ('core', '0055_alter_tag_slug'),
- ('core', '0056_remove_tag_uuid'),
- ('core', '0057_rename_id_tag_old_id'),
- ('core', '0058_alter_tag_old_id'),
- ('core', '0059_tag_id'),
- ('core', '0060_alter_tag_id'),
- ('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
- ('core', '0062_alter_snapshottag_old_tag'),
- ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
- ('core', '0064_alter_snapshottag_unique_together_and_more'),
- ('core', '0065_remove_snapshottag_old_tag'),
- ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
- ('core', '0067_alter_snapshottag_tag'),
- ('core', '0068_alter_archiveresult_options'),
- ('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
- ('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
- ('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
- ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
- ('core', '0073_rename_created_archiveresult_created_at_and_more'),
- ('core', '0074_alter_snapshot_downloaded_at'),
- # For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
- # We already recorded 0023-0074 above, so Django will know the state
- # For 0.8.x: Record original machine migrations (before squashing)
- # DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
- ('machine', '0001_initial'),
- ('machine', '0002_alter_machine_stats_installedbinary'),
- ('machine', '0003_alter_installedbinary_options_and_more'),
- ('machine', '0004_alter_installedbinary_abspath_and_more'),
- # Then the new migrations after squashing
- ('machine', '0002_rename_custom_cmds_to_overrides'),
- ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
- ('machine', '0004_drop_dependency_table'),
- # Crawls must come before core.0024 because 0024_b depends on it
- ('crawls', '0001_initial'),
- # Core 0024 migrations chain (in dependency order)
- ('core', '0024_b_clear_config_fields'),
- ('core', '0024_c_disable_fk_checks'),
- ('core', '0024_d_fix_crawls_config'),
- ('core', '0024_snapshot_crawl'),
- ('core', '0024_f_add_snapshot_config'),
- ('core', '0025_allow_duplicate_urls_per_crawl'),
- # For 0.8.x: Record original api migration (before squashing)
- # DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
- ('api', '0001_initial'),
- ('api', '0002_alter_apitoken_options'),
- ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
- ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
- ('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
- ('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
- ('api', '0007_alter_apitoken_created_by'),
- ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
- ('api', '0009_rename_created_apitoken_created_at_and_more'),
- # Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
- # Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
- # Do NOT record 0026+ as they need to be tested during migration
- ]
- for app, name in migrations:
- cursor.execute("""
- INSERT INTO django_migrations (app, name, applied)
- VALUES (?, ?, datetime('now'))
- """, (app, name))
- conn.commit()
- conn.close()
- return created_data
- # =============================================================================
- # Helper Functions
- # =============================================================================
def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = None) -> subprocess.CompletedProcess:
    """Run an archivebox CLI command in a subprocess with the given data directory.

    Args:
        data_dir: ArchiveBox data directory; exported as DATA_DIR and used as cwd.
        args: CLI arguments appended after `python -m archivebox`.
        timeout: Seconds before subprocess.run raises TimeoutExpired.
        env: Optional extra environment variables; these override the defaults below.

    Returns:
        The CompletedProcess with stdout/stderr captured as text.
    """
    base_env = os.environ.copy()
    base_env['DATA_DIR'] = str(data_dir)
    base_env['USE_COLOR'] = 'False'
    base_env['SHOW_PROGRESS'] = 'False'
    # Disable ALL extractors for faster tests (can be overridden by env parameter).
    # One loop instead of 14 copy-pasted assignments.
    for extractor in (
        'ARCHIVEDOTORG', 'TITLE', 'FAVICON', 'WGET', 'SINGLEFILE',
        'SCREENSHOT', 'PDF', 'DOM', 'READABILITY', 'MERCURY',
        'GIT', 'YTDLP', 'HEADERS', 'HTMLTOTEXT',
    ):
        base_env[f'SAVE_{extractor}'] = 'False'
    # Caller-supplied env vars take precedence over all defaults above
    if env:
        base_env.update(env)
    cmd = [sys.executable, '-m', 'archivebox'] + args
    return subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        env=base_env,
        cwd=str(data_dir),
        timeout=timeout,
    )
def create_data_dir_structure(data_dir: Path):
    """Create the basic ArchiveBox data directory structure (archive/, sources/, logs/)."""
    for subdir in ('archive', 'sources', 'logs'):
        (data_dir / subdir).mkdir(parents=True, exist_ok=True)
def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
    """Verify the number of snapshots in the database."""
    connection = sqlite3.connect(str(db_path))
    (count,) = connection.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()
    connection.close()
    if count != expected:
        return False, f"Snapshot count mismatch: expected {expected}, got {count}"
    return True, f"Snapshot count OK: {count}"
def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
    """Verify the number of tags in the database (exact match)."""
    connection = sqlite3.connect(str(db_path))
    (count,) = connection.execute("SELECT COUNT(*) FROM core_tag").fetchone()
    connection.close()
    if count != expected:
        return False, f"Tag count mismatch: expected {expected}, got {count}"
    return True, f"Tag count OK: {count}"
def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]:
    """Verify the number of archive results in the database."""
    connection = sqlite3.connect(str(db_path))
    (count,) = connection.execute("SELECT COUNT(*) FROM core_archiveresult").fetchone()
    connection.close()
    if count != expected:
        return False, f"ArchiveResult count mismatch: expected {expected}, got {count}"
    return True, f"ArchiveResult count OK: {count}"
def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool, str]:
    """Verify ALL expected URLs exist in snapshots."""
    connection = sqlite3.connect(str(db_path))
    rows = connection.execute("SELECT url FROM core_snapshot").fetchall()
    connection.close()
    stored_urls = {url for (url,) in rows}
    missing = set(expected_urls) - stored_urls
    if missing:
        return False, f"Missing URLs: {missing}"
    return True, "All URLs preserved"
def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tuple[bool, str]:
    """Verify ALL snapshot titles are preserved."""
    connection = sqlite3.connect(str(db_path))
    rows = connection.execute("SELECT url, title FROM core_snapshot").fetchall()
    connection.close()
    stored = dict(rows)
    mismatches = []
    for url, expected_title in expected_titles.items():
        if url not in stored:
            mismatches.append(f"{url}: missing from database")
        elif stored[url] != expected_title:
            mismatches.append(f"{url}: expected '{expected_title}', got '{stored[url]}'")
    if mismatches:
        return False, f"Title mismatches: {mismatches}"
    return True, "All titles preserved"
def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
    """Verify foreign key relationships are intact."""
    connection = sqlite3.connect(str(db_path))
    # Check ArchiveResult -> Snapshot FK: count results whose snapshot no longer exists
    (orphaned_results,) = connection.execute("""
        SELECT COUNT(*) FROM core_archiveresult ar
        WHERE NOT EXISTS (SELECT 1 FROM core_snapshot s WHERE s.id = ar.snapshot_id)
    """).fetchone()
    connection.close()
    if orphaned_results:
        return False, f"Found {orphaned_results} orphaned ArchiveResults"
    return True, "Foreign keys intact"
def verify_all_snapshots_in_output(output: str, snapshots: List[Dict]) -> Tuple[bool, str]:
    """Verify ALL snapshots appear in command output (not just one)."""
    missing = []
    for snapshot in snapshots:
        url_fragment = snapshot['url'][:30]
        title = snapshot.get('title', '')
        url_shown = url_fragment in output
        title_shown = bool(title) and title in output
        # A snapshot counts as present if either its URL prefix or its title appears
        if not (url_shown or title_shown):
            missing.append(snapshot['url'])
    if missing:
        return False, f"Missing snapshots in output: {missing}"
    return True, "All snapshots found in output"
def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
    """Verify the number of crawls in the database."""
    connection = sqlite3.connect(str(db_path))
    (count,) = connection.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()
    connection.close()
    if count != expected:
        return False, f"Crawl count mismatch: expected {expected}, got {count}"
    return True, f"Crawl count OK: {count}"
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
    """
    Verify that ArchiveResults were properly migrated to Process records.
    Checks:
    1. All ArchiveResults have process_id set
    2. Process count matches ArchiveResult count
    3. Status mapping is correct

    Returns (ok, message); message carries at most 5 status-mapping errors.
    """
    # ArchiveResult status -> (expected Process status, expected exit_code).
    # Hoisted out of the row loop: the original rebuilt this dict per row.
    expected_by_ar_status = {
        'queued': ('queued', None),
        'started': ('running', None),
        'backoff': ('queued', None),
        'succeeded': ('exited', 0),
        'failed': ('exited', 1),
        'skipped': ('exited', None),
    }
    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()
        # Check all ArchiveResults have process_id
        cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
        null_count = cursor.fetchone()[0]
        if null_count > 0:
            return False, f"Found {null_count} ArchiveResults without process_id"
        # Check Process count matches ArchiveResult count (1:1 migration)
        cursor.execute("SELECT COUNT(*) FROM machine_process")
        process_count = cursor.fetchone()[0]
        if process_count != expected_archiveresult_count:
            return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
        # Check status mapping on every migrated row
        cursor.execute("""
            SELECT ar.status, p.status, p.exit_code
            FROM core_archiveresult ar
            JOIN machine_process p ON ar.process_id = p.id
        """)
        status_errors = []
        for ar_status, p_status, p_exit_code in cursor.fetchall():
            # Unknown AR statuses fall back to ('queued', None), same as the migration
            expected_p_status, expected_exit_code = expected_by_ar_status.get(ar_status, ('queued', None))
            if p_status != expected_p_status:
                status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
            if p_exit_code != expected_exit_code:
                status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
        if status_errors:
            return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
        return True, f"Process migration verified: {process_count} Processes created"
    finally:
        # Single close point; also releases the connection if a query raises
        # (the original leaked it on exception and closed it before each return).
        conn.close()
|