test_migrations_helpers.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191
  1. #!/usr/bin/env python3
  2. """
  3. Helper functions and schema definitions for migration tests.
  4. This module provides:
  5. - Schema definitions for each major ArchiveBox version (0.4.x, 0.7.x, 0.8.x)
  6. - Data seeding functions to populate test databases
  7. - Helper functions to run archivebox commands and verify results
  8. """
  9. import os
  10. import sys
  11. import json
  12. import sqlite3
  13. import subprocess
  14. from pathlib import Path
  15. from datetime import datetime, timezone
  16. from typing import Dict, List, Tuple
  17. from uuid import uuid4
  18. # =============================================================================
  19. # Schema Definitions for Each Version
  20. # =============================================================================
# Schema as shipped by ArchiveBox 0.4.x: a single core_snapshot table that
# stores tags as a comma-separated string column (no separate Tag model yet),
# plus only django_migrations from Django's own bookkeeping tables.
SCHEMA_0_4 = """
-- Django system tables (minimal)
CREATE TABLE IF NOT EXISTS django_migrations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    app VARCHAR(255) NOT NULL,
    name VARCHAR(255) NOT NULL,
    applied DATETIME NOT NULL
);
-- Core tables for 0.4.x
CREATE TABLE IF NOT EXISTS core_snapshot (
    id CHAR(32) PRIMARY KEY,
    url VARCHAR(2000) NOT NULL UNIQUE,
    timestamp VARCHAR(32) NOT NULL UNIQUE,
    title VARCHAR(128),
    tags VARCHAR(256),
    added DATETIME NOT NULL,
    updated DATETIME
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
"""
# Schema matching ArchiveBox 0.7.x: full Django auth/admin/session tables,
# a proper core_tag model with an M2M core_snapshot_tags join table, and
# core_archiveresult rows per extractor run. Ends by pre-populating
# django_content_type for every model the migration tests expect to exist.
SCHEMA_0_7 = """
-- Django system tables (complete for 0.7.x)
CREATE TABLE IF NOT EXISTS django_migrations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    app VARCHAR(255) NOT NULL,
    name VARCHAR(255) NOT NULL,
    applied DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS django_content_type (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    app_label VARCHAR(100) NOT NULL,
    model VARCHAR(100) NOT NULL,
    UNIQUE(app_label, model)
);
CREATE TABLE IF NOT EXISTS auth_permission (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(255) NOT NULL,
    content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
    codename VARCHAR(100) NOT NULL,
    UNIQUE(content_type_id, codename)
);
CREATE TABLE IF NOT EXISTS auth_group (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(150) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS auth_group_permissions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    group_id INTEGER NOT NULL REFERENCES auth_group(id),
    permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
    UNIQUE(group_id, permission_id)
);
CREATE TABLE IF NOT EXISTS auth_user (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    password VARCHAR(128) NOT NULL,
    last_login DATETIME,
    is_superuser BOOL NOT NULL,
    username VARCHAR(150) NOT NULL UNIQUE,
    first_name VARCHAR(150) NOT NULL,
    last_name VARCHAR(150) NOT NULL,
    email VARCHAR(254) NOT NULL,
    is_staff BOOL NOT NULL,
    is_active BOOL NOT NULL,
    date_joined DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS auth_user_groups (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id INTEGER NOT NULL REFERENCES auth_user(id),
    group_id INTEGER NOT NULL REFERENCES auth_group(id),
    UNIQUE(user_id, group_id)
);
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id INTEGER NOT NULL REFERENCES auth_user(id),
    permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
    UNIQUE(user_id, permission_id)
);
CREATE TABLE IF NOT EXISTS django_admin_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    action_time DATETIME NOT NULL,
    object_id TEXT,
    object_repr VARCHAR(200) NOT NULL,
    action_flag SMALLINT UNSIGNED NOT NULL,
    change_message TEXT NOT NULL,
    content_type_id INTEGER REFERENCES django_content_type(id),
    user_id INTEGER NOT NULL REFERENCES auth_user(id)
);
CREATE TABLE IF NOT EXISTS django_session (
    session_key VARCHAR(40) NOT NULL PRIMARY KEY,
    session_data TEXT NOT NULL,
    expire_date DATETIME NOT NULL
);
-- Core tables for 0.7.x
CREATE TABLE IF NOT EXISTS core_tag (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(100) NOT NULL UNIQUE,
    slug VARCHAR(100) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS core_snapshot (
    id CHAR(32) PRIMARY KEY,
    url VARCHAR(2000) NOT NULL UNIQUE,
    timestamp VARCHAR(32) NOT NULL UNIQUE,
    title VARCHAR(512),
    added DATETIME NOT NULL,
    updated DATETIME
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
-- Many-to-many for snapshot tags
CREATE TABLE IF NOT EXISTS core_snapshot_tags (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
    tag_id INTEGER NOT NULL REFERENCES core_tag(id),
    UNIQUE(snapshot_id, tag_id)
);
CREATE TABLE IF NOT EXISTS core_archiveresult (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
    extractor VARCHAR(32) NOT NULL,
    cmd TEXT,
    pwd VARCHAR(256),
    cmd_version VARCHAR(128),
    output VARCHAR(1024),
    start_ts DATETIME,
    end_ts DATETIME,
    status VARCHAR(16) NOT NULL
);
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
-- Insert required content types
INSERT INTO django_content_type (app_label, model) VALUES
    ('contenttypes', 'contenttype'),
    ('auth', 'permission'),
    ('auth', 'group'),
    ('auth', 'user'),
    ('admin', 'logentry'),
    ('sessions', 'session'),
    ('core', 'snapshot'),
    ('core', 'archiveresult'),
    ('core', 'tag');
"""
# Schema matching ArchiveBox 0.8.x: everything in 0.7.x plus the machine/,
# api/, and crawls/ apps, UUID (CHAR(36)) primary keys on the new models and
# on core_snapshot, and status/retry_at queue columns.
# NOTE(review): crawls_crawlschedule.template_id references crawls_crawl,
# which is created *after* it below — SQLite permits this because FK targets
# are only resolved when the constraint is enforced, not at CREATE time.
# NOTE(review): the content-type list includes ('crawls', 'seed') although no
# seed table is created here — presumably intentional for these tests; confirm.
SCHEMA_0_8 = """
-- Django system tables (complete for 0.8.x)
CREATE TABLE IF NOT EXISTS django_migrations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    app VARCHAR(255) NOT NULL,
    name VARCHAR(255) NOT NULL,
    applied DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS django_content_type (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    app_label VARCHAR(100) NOT NULL,
    model VARCHAR(100) NOT NULL,
    UNIQUE(app_label, model)
);
CREATE TABLE IF NOT EXISTS auth_permission (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(255) NOT NULL,
    content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
    codename VARCHAR(100) NOT NULL,
    UNIQUE(content_type_id, codename)
);
CREATE TABLE IF NOT EXISTS auth_group (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(150) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS auth_group_permissions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    group_id INTEGER NOT NULL REFERENCES auth_group(id),
    permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
    UNIQUE(group_id, permission_id)
);
CREATE TABLE IF NOT EXISTS auth_user (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    password VARCHAR(128) NOT NULL,
    last_login DATETIME,
    is_superuser BOOL NOT NULL,
    username VARCHAR(150) NOT NULL UNIQUE,
    first_name VARCHAR(150) NOT NULL,
    last_name VARCHAR(150) NOT NULL,
    email VARCHAR(254) NOT NULL,
    is_staff BOOL NOT NULL,
    is_active BOOL NOT NULL,
    date_joined DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS auth_user_groups (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id INTEGER NOT NULL REFERENCES auth_user(id),
    group_id INTEGER NOT NULL REFERENCES auth_group(id),
    UNIQUE(user_id, group_id)
);
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id INTEGER NOT NULL REFERENCES auth_user(id),
    permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
    UNIQUE(user_id, permission_id)
);
CREATE TABLE IF NOT EXISTS django_admin_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    action_time DATETIME NOT NULL,
    object_id TEXT,
    object_repr VARCHAR(200) NOT NULL,
    action_flag SMALLINT UNSIGNED NOT NULL,
    change_message TEXT NOT NULL,
    content_type_id INTEGER REFERENCES django_content_type(id),
    user_id INTEGER NOT NULL REFERENCES auth_user(id)
);
CREATE TABLE IF NOT EXISTS django_session (
    session_key VARCHAR(40) NOT NULL PRIMARY KEY,
    session_data TEXT NOT NULL,
    expire_date DATETIME NOT NULL
);
-- Machine app tables (added in 0.8.x)
CREATE TABLE IF NOT EXISTS machine_machine (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    guid VARCHAR(64) NOT NULL UNIQUE,
    hostname VARCHAR(63),
    hw_in_docker BOOLEAN NOT NULL DEFAULT 0,
    hw_in_vm BOOLEAN NOT NULL DEFAULT 0,
    hw_manufacturer VARCHAR(63),
    hw_product VARCHAR(63),
    hw_uuid VARCHAR(255),
    os_arch VARCHAR(15),
    os_family VARCHAR(15),
    os_platform VARCHAR(63),
    os_release VARCHAR(63),
    os_kernel VARCHAR(255),
    stats TEXT DEFAULT '{}',
    config TEXT DEFAULT '{}',
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS machine_networkinterface (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    machine_id CHAR(36) NOT NULL REFERENCES machine_machine(id),
    mac_address VARCHAR(17),
    ip_public VARCHAR(45),
    ip_local VARCHAR(45),
    dns_server VARCHAR(45),
    hostname VARCHAR(63),
    iface VARCHAR(15),
    isp VARCHAR(63),
    city VARCHAR(63),
    region VARCHAR(63),
    country VARCHAR(63),
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS machine_dependency (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    bin_name VARCHAR(63) NOT NULL UNIQUE,
    bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
    custom_cmds TEXT DEFAULT '{}',
    config TEXT DEFAULT '{}'
);
CREATE TABLE IF NOT EXISTS machine_binary (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    machine_id CHAR(36) REFERENCES machine_machine(id),
    dependency_id CHAR(36) REFERENCES machine_dependency(id),
    name VARCHAR(63),
    binprovider VARCHAR(31),
    abspath VARCHAR(255),
    version VARCHAR(32),
    sha256 VARCHAR(64),
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- API app tables (added in 0.8.x)
CREATE TABLE IF NOT EXISTS api_apitoken (
    id CHAR(36) PRIMARY KEY,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    token VARCHAR(32) NOT NULL UNIQUE,
    expires DATETIME
);
CREATE TABLE IF NOT EXISTS api_outboundwebhook (
    id CHAR(36) PRIMARY KEY,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    name VARCHAR(255) NOT NULL DEFAULT '',
    signal VARCHAR(255) NOT NULL,
    ref VARCHAR(255) NOT NULL,
    endpoint VARCHAR(2083) NOT NULL,
    headers TEXT DEFAULT '{}',
    auth_token VARCHAR(4000) NOT NULL DEFAULT '',
    enabled BOOLEAN NOT NULL DEFAULT 1,
    keep_last_response BOOLEAN NOT NULL DEFAULT 0,
    last_response TEXT NOT NULL DEFAULT '',
    last_success DATETIME,
    last_failure DATETIME,
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Tag table (AutoField PK in 0.8.x)
CREATE TABLE IF NOT EXISTS core_tag (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(100) NOT NULL UNIQUE,
    slug VARCHAR(100) NOT NULL UNIQUE,
    created_at DATETIME,
    modified_at DATETIME,
    created_by_id INTEGER REFERENCES auth_user(id)
);
-- Crawls tables (new in 0.8.x)
CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    modified_at DATETIME,
    schedule VARCHAR(64) NOT NULL,
    is_enabled BOOLEAN NOT NULL DEFAULT 1,
    label VARCHAR(64) NOT NULL DEFAULT '',
    notes TEXT NOT NULL DEFAULT '',
    template_id CHAR(36) REFERENCES crawls_crawl(id),
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS crawls_crawl (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    modified_at DATETIME,
    urls TEXT NOT NULL,
    config TEXT DEFAULT '{}',
    max_depth SMALLINT UNSIGNED NOT NULL DEFAULT 0,
    tags_str VARCHAR(1024) NOT NULL DEFAULT '',
    persona_id CHAR(36),
    label VARCHAR(64) NOT NULL DEFAULT '',
    notes TEXT NOT NULL DEFAULT '',
    schedule_id CHAR(36),
    output_dir VARCHAR(256) NOT NULL DEFAULT '',
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    retry_at DATETIME,
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Snapshot table (0.8.x with UUID PK, status, crawl FK)
CREATE TABLE IF NOT EXISTS core_snapshot (
    id CHAR(36) PRIMARY KEY,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    url VARCHAR(2000) NOT NULL,
    timestamp VARCHAR(32) NOT NULL UNIQUE,
    bookmarked_at DATETIME NOT NULL,
    crawl_id CHAR(36) REFERENCES crawls_crawl(id),
    title VARCHAR(512),
    downloaded_at DATETIME,
    depth SMALLINT UNSIGNED NOT NULL DEFAULT 0,
    retry_at DATETIME,
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    config TEXT DEFAULT '{}',
    notes TEXT NOT NULL DEFAULT '',
    output_dir VARCHAR(256),
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_created_at ON core_snapshot(created_at);
-- Many-to-many for snapshot tags
CREATE TABLE IF NOT EXISTS core_snapshot_tags (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
    tag_id INTEGER NOT NULL REFERENCES core_tag(id),
    UNIQUE(snapshot_id, tag_id)
);
-- Core ArchiveResult table (0.8.x with AutoField PK + UUID, status)
CREATE TABLE IF NOT EXISTS core_archiveresult (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    uuid CHAR(36) UNIQUE,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
    extractor VARCHAR(32) NOT NULL,
    pwd VARCHAR(256),
    cmd TEXT,
    cmd_version VARCHAR(128),
    output VARCHAR(1024),
    start_ts DATETIME,
    end_ts DATETIME,
    status VARCHAR(16) NOT NULL DEFAULT 'queued',
    retry_at DATETIME,
    notes TEXT NOT NULL DEFAULT '',
    output_dir VARCHAR(256),
    iface_id INTEGER,
    config TEXT DEFAULT '{}',
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
-- Insert required content types
INSERT INTO django_content_type (app_label, model) VALUES
    ('contenttypes', 'contenttype'),
    ('auth', 'permission'),
    ('auth', 'group'),
    ('auth', 'user'),
    ('admin', 'logentry'),
    ('sessions', 'session'),
    ('core', 'snapshot'),
    ('core', 'archiveresult'),
    ('core', 'tag'),
    ('machine', 'machine'),
    ('machine', 'networkinterface'),
    ('machine', 'dependency'),
    ('machine', 'binary'),
    ('crawls', 'crawl'),
    ('crawls', 'crawlschedule'),
    ('crawls', 'seed'),
    ('api', 'apitoken'),
    ('api', 'outboundwebhook');
"""
  446. # =============================================================================
  447. # Test Data Generators
  448. # =============================================================================
  449. def generate_uuid() -> str:
  450. """Generate a UUID string without dashes for SQLite."""
  451. return uuid4().hex
  452. def generate_timestamp() -> str:
  453. """Generate a timestamp string like ArchiveBox uses."""
  454. return datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S') + '.000000'
  455. def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
  456. """Seed a 0.4.x database with realistic test data."""
  457. conn = sqlite3.connect(str(db_path))
  458. cursor = conn.cursor()
  459. created_data = {
  460. 'snapshots': [],
  461. 'tags_str': [],
  462. }
  463. test_urls = [
  464. ('https://example.com/page1', 'Example Page 1', 'news,tech'),
  465. ('https://example.org/article', 'Article Title', 'blog,reading'),
  466. ('https://github.com/user/repo', 'GitHub Repository', 'code,github'),
  467. ('https://news.ycombinator.com/item?id=12345', 'HN Discussion', 'news,discussion'),
  468. ('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', 'reference,wiki'),
  469. ]
  470. for i, (url, title, tags) in enumerate(test_urls):
  471. snapshot_id = generate_uuid()
  472. timestamp = f'2024010{i+1}120000.000000'
  473. added = f'2024-01-0{i+1} 12:00:00'
  474. cursor.execute("""
  475. INSERT INTO core_snapshot (id, url, timestamp, title, tags, added, updated)
  476. VALUES (?, ?, ?, ?, ?, ?, ?)
  477. """, (snapshot_id, url, timestamp, title, tags, added, added))
  478. created_data['snapshots'].append({
  479. 'id': snapshot_id,
  480. 'url': url,
  481. 'timestamp': timestamp,
  482. 'title': title,
  483. 'tags': tags,
  484. })
  485. created_data['tags_str'].append(tags)
  486. cursor.execute("""
  487. INSERT INTO django_migrations (app, name, applied)
  488. VALUES ('core', '0001_initial', datetime('now'))
  489. """)
  490. conn.commit()
  491. conn.close()
  492. return created_data
  493. def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
  494. """Seed a 0.7.x database with realistic test data."""
  495. conn = sqlite3.connect(str(db_path))
  496. cursor = conn.cursor()
  497. created_data = {
  498. 'users': [],
  499. 'snapshots': [],
  500. 'tags': [],
  501. 'archiveresults': [],
  502. }
  503. # Create a user
  504. cursor.execute("""
  505. INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
  506. email, is_staff, is_active, date_joined)
  507. VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
  508. '[email protected]', 1, 1, datetime('now'))
  509. """)
  510. user_id = cursor.lastrowid
  511. created_data['users'].append({'id': user_id, 'username': 'admin'})
  512. # Create 5 tags
  513. tag_names = ['news', 'tech', 'blog', 'reference', 'code']
  514. for name in tag_names:
  515. cursor.execute("""
  516. INSERT INTO core_tag (name, slug) VALUES (?, ?)
  517. """, (name, name.lower()))
  518. tag_id = cursor.lastrowid
  519. created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
  520. # Create 5 snapshots
  521. test_urls = [
  522. ('https://example.com/page1', 'Example Page 1'),
  523. ('https://example.org/article', 'Article Title'),
  524. ('https://github.com/user/repo', 'GitHub Repository'),
  525. ('https://news.ycombinator.com/item?id=12345', 'HN Discussion'),
  526. ('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test'),
  527. ]
  528. for i, (url, title) in enumerate(test_urls):
  529. snapshot_id = generate_uuid()
  530. timestamp = f'2024010{i+1}120000.000000'
  531. added = f'2024-01-0{i+1} 12:00:00'
  532. cursor.execute("""
  533. INSERT INTO core_snapshot (id, url, timestamp, title, added, updated)
  534. VALUES (?, ?, ?, ?, ?, ?)
  535. """, (snapshot_id, url, timestamp, title, added, added))
  536. created_data['snapshots'].append({
  537. 'id': snapshot_id,
  538. 'url': url,
  539. 'timestamp': timestamp,
  540. 'title': title,
  541. })
  542. # Assign 2 tags to each snapshot
  543. tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
  544. for tag_id in tag_ids:
  545. cursor.execute("""
  546. INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
  547. """, (snapshot_id, tag_id))
  548. # Create 5 archive results for each snapshot
  549. extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
  550. statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
  551. for j, (extractor, status) in enumerate(zip(extractors, statuses)):
  552. cursor.execute("""
  553. INSERT INTO core_archiveresult
  554. (snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
  555. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
  556. """, (
  557. snapshot_id, extractor,
  558. json.dumps([extractor, '--version']),
  559. f'/data/archive/{timestamp}',
  560. '1.0.0',
  561. f'{extractor}/index.html' if status == 'succeeded' else '',
  562. f'2024-01-0{i+1} 12:00:0{j}',
  563. f'2024-01-0{i+1} 12:00:1{j}',
  564. status
  565. ))
  566. created_data['archiveresults'].append({
  567. 'snapshot_id': snapshot_id,
  568. 'extractor': extractor,
  569. 'status': status,
  570. })
  571. # Record migrations as applied (0.7.x migrations up to 0022)
  572. migrations = [
  573. ('contenttypes', '0001_initial'),
  574. ('contenttypes', '0002_remove_content_type_name'),
  575. ('auth', '0001_initial'),
  576. ('auth', '0002_alter_permission_name_max_length'),
  577. ('auth', '0003_alter_user_email_max_length'),
  578. ('auth', '0004_alter_user_username_opts'),
  579. ('auth', '0005_alter_user_last_login_null'),
  580. ('auth', '0006_require_contenttypes_0002'),
  581. ('auth', '0007_alter_validators_add_error_messages'),
  582. ('auth', '0008_alter_user_username_max_length'),
  583. ('auth', '0009_alter_user_last_name_max_length'),
  584. ('auth', '0010_alter_group_name_max_length'),
  585. ('auth', '0011_update_proxy_permissions'),
  586. ('auth', '0012_alter_user_first_name_max_length'),
  587. ('admin', '0001_initial'),
  588. ('admin', '0002_logentry_remove_auto_add'),
  589. ('admin', '0003_logentry_add_action_flag_choices'),
  590. ('sessions', '0001_initial'),
  591. ('core', '0001_initial'),
  592. ('core', '0002_auto_20200625_1521'),
  593. ('core', '0003_auto_20200630_1034'),
  594. ('core', '0004_auto_20200713_1552'),
  595. ('core', '0005_auto_20200728_0326'),
  596. ('core', '0006_auto_20201012_1520'),
  597. ('core', '0007_archiveresult'),
  598. ('core', '0008_auto_20210105_1421'),
  599. ('core', '0009_auto_20210216_1038'),
  600. ('core', '0010_auto_20210216_1055'),
  601. ('core', '0011_auto_20210216_1331'),
  602. ('core', '0012_auto_20210216_1425'),
  603. ('core', '0013_auto_20210218_0729'),
  604. ('core', '0014_auto_20210218_0729'),
  605. ('core', '0015_auto_20210218_0730'),
  606. ('core', '0016_auto_20210218_1204'),
  607. ('core', '0017_auto_20210219_0211'),
  608. ('core', '0018_auto_20210327_0952'),
  609. ('core', '0019_auto_20210401_0654'),
  610. ('core', '0020_auto_20210410_1031'),
  611. ('core', '0021_auto_20220914_0934'),
  612. ('core', '0022_auto_20231023_2008'),
  613. ]
  614. for app, name in migrations:
  615. cursor.execute("""
  616. INSERT INTO django_migrations (app, name, applied)
  617. VALUES (?, ?, datetime('now'))
  618. """, (app, name))
  619. conn.commit()
  620. conn.close()
  621. return created_data
  622. def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
  623. """Seed a 0.8.x database with realistic test data including Crawls."""
  624. conn = sqlite3.connect(str(db_path))
  625. cursor = conn.cursor()
  626. created_data = {
  627. 'users': [],
  628. 'crawls': [],
  629. 'snapshots': [],
  630. 'tags': [],
  631. 'archiveresults': [],
  632. }
  633. # Create a user
  634. cursor.execute("""
  635. INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
  636. email, is_staff, is_active, date_joined)
  637. VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
  638. '[email protected]', 1, 1, datetime('now'))
  639. """)
  640. user_id = cursor.lastrowid
  641. created_data['users'].append({'id': user_id, 'username': 'admin'})
  642. # Create 5 tags
  643. tag_names = ['news', 'tech', 'blog', 'reference', 'code']
  644. for name in tag_names:
  645. cursor.execute("""
  646. INSERT INTO core_tag (name, slug, created_at, modified_at, created_by_id)
  647. VALUES (?, ?, datetime('now'), datetime('now'), ?)
  648. """, (name, name.lower(), user_id))
  649. tag_id = cursor.lastrowid
  650. created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
  651. # Create 2 Crawls (0.9.0 schema - no seeds)
  652. test_crawls = [
  653. ('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
  654. ('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
  655. ]
  656. for i, (urls, max_depth, label) in enumerate(test_crawls):
  657. crawl_id = generate_uuid()
  658. cursor.execute("""
  659. INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
  660. config, max_depth, tags_str, label, status, retry_at,
  661. num_uses_failed, num_uses_succeeded)
  662. VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
  663. """, (crawl_id, user_id, urls, max_depth, label))
  664. created_data['crawls'].append({
  665. 'id': crawl_id,
  666. 'urls': urls,
  667. 'max_depth': max_depth,
  668. 'label': label,
  669. })
  670. # Create 5 snapshots linked to crawls
  671. test_urls = [
  672. ('https://example.com/page1', 'Example Page 1', created_data['crawls'][0]['id']),
  673. ('https://example.org/article', 'Article Title', created_data['crawls'][0]['id']),
  674. ('https://github.com/user/repo', 'GitHub Repository', created_data['crawls'][1]['id']),
  675. ('https://news.ycombinator.com/item?id=12345', 'HN Discussion', None),
  676. ('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', None),
  677. ]
  678. for i, (url, title, crawl_id) in enumerate(test_urls):
  679. snapshot_id = generate_uuid()
  680. timestamp = f'2024010{i+1}120000.000000'
  681. created_at = f'2024-01-0{i+1} 12:00:00'
  682. cursor.execute("""
  683. INSERT INTO core_snapshot (id, created_by_id, created_at, modified_at, url, timestamp,
  684. bookmarked_at, crawl_id, title, depth, status, config, notes)
  685. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 'queued', '{}', '')
  686. """, (snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title))
  687. created_data['snapshots'].append({
  688. 'id': snapshot_id,
  689. 'url': url,
  690. 'timestamp': timestamp,
  691. 'title': title,
  692. 'crawl_id': crawl_id,
  693. })
  694. # Assign 2 tags to each snapshot
  695. tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
  696. for tag_id in tag_ids:
  697. cursor.execute("""
  698. INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
  699. """, (snapshot_id, tag_id))
  700. # Create 5 archive results for each snapshot
  701. extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
  702. statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
  703. for j, (extractor, status) in enumerate(zip(extractors, statuses)):
  704. result_uuid = generate_uuid()
  705. cursor.execute("""
  706. INSERT INTO core_archiveresult
  707. (uuid, created_by_id, created_at, modified_at, snapshot_id, extractor, pwd,
  708. cmd, cmd_version, output, start_ts, end_ts, status, retry_at, notes, output_dir)
  709. VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), '', ?)
  710. """, (
  711. result_uuid, user_id, f'2024-01-0{i+1} 12:00:0{j}', f'2024-01-0{i+1} 12:00:1{j}',
  712. snapshot_id, extractor,
  713. f'/data/archive/{timestamp}',
  714. json.dumps([extractor, '--version']),
  715. '1.0.0',
  716. f'{extractor}/index.html' if status == 'succeeded' else '',
  717. f'2024-01-0{i+1} 12:00:0{j}',
  718. f'2024-01-0{i+1} 12:00:1{j}',
  719. status,
  720. f'{extractor}',
  721. ))
  722. created_data['archiveresults'].append({
  723. 'uuid': result_uuid,
  724. 'snapshot_id': snapshot_id,
  725. 'extractor': extractor,
  726. 'status': status,
  727. })
  728. # Record migrations as applied (0.8.x migrations)
  729. migrations = [
  730. ('contenttypes', '0001_initial'),
  731. ('contenttypes', '0002_remove_content_type_name'),
  732. ('auth', '0001_initial'),
  733. ('auth', '0002_alter_permission_name_max_length'),
  734. ('auth', '0003_alter_user_email_max_length'),
  735. ('auth', '0004_alter_user_username_opts'),
  736. ('auth', '0005_alter_user_last_login_null'),
  737. ('auth', '0006_require_contenttypes_0002'),
  738. ('auth', '0007_alter_validators_add_error_messages'),
  739. ('auth', '0008_alter_user_username_max_length'),
  740. ('auth', '0009_alter_user_last_name_max_length'),
  741. ('auth', '0010_alter_group_name_max_length'),
  742. ('auth', '0011_update_proxy_permissions'),
  743. ('auth', '0012_alter_user_first_name_max_length'),
  744. ('admin', '0001_initial'),
  745. ('admin', '0002_logentry_remove_auto_add'),
  746. ('admin', '0003_logentry_add_action_flag_choices'),
  747. ('sessions', '0001_initial'),
  748. ('core', '0001_initial'),
  749. ('core', '0002_auto_20200625_1521'),
  750. ('core', '0003_auto_20200630_1034'),
  751. ('core', '0004_auto_20200713_1552'),
  752. ('core', '0005_auto_20200728_0326'),
  753. ('core', '0006_auto_20201012_1520'),
  754. ('core', '0007_archiveresult'),
  755. ('core', '0008_auto_20210105_1421'),
  756. ('core', '0009_auto_20210216_1038'),
  757. ('core', '0010_auto_20210216_1055'),
  758. ('core', '0011_auto_20210216_1331'),
  759. ('core', '0012_auto_20210216_1425'),
  760. ('core', '0013_auto_20210218_0729'),
  761. ('core', '0014_auto_20210218_0729'),
  762. ('core', '0015_auto_20210218_0730'),
  763. ('core', '0016_auto_20210218_1204'),
  764. ('core', '0017_auto_20210219_0211'),
  765. ('core', '0018_auto_20210327_0952'),
  766. ('core', '0019_auto_20210401_0654'),
  767. ('core', '0020_auto_20210410_1031'),
  768. ('core', '0021_auto_20220914_0934'),
  769. ('core', '0022_auto_20231023_2008'),
  770. # For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
  771. ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
  772. ('core', '0024_auto_20240513_1143'),
  773. ('core', '0025_alter_archiveresult_uuid'),
  774. ('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
  775. ('core', '0027_update_snapshot_ids'),
  776. ('core', '0028_alter_archiveresult_uuid'),
  777. ('core', '0029_alter_archiveresult_id'),
  778. ('core', '0030_alter_archiveresult_uuid'),
  779. ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
  780. ('core', '0032_alter_archiveresult_id'),
  781. ('core', '0033_rename_id_archiveresult_old_id'),
  782. ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
  783. ('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
  784. ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
  785. ('core', '0037_rename_id_snapshot_old_id'),
  786. ('core', '0038_rename_uuid_snapshot_id'),
  787. ('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
  788. ('core', '0040_archiveresult_snapshot'),
  789. ('core', '0041_alter_archiveresult_snapshot_and_more'),
  790. ('core', '0042_remove_archiveresult_snapshot_old'),
  791. ('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
  792. ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
  793. ('core', '0045_alter_snapshot_old_id'),
  794. ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
  795. ('core', '0047_alter_snapshottag_unique_together_and_more'),
  796. ('core', '0048_alter_archiveresult_snapshot_and_more'),
  797. ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
  798. ('core', '0050_alter_snapshottag_snapshot_old'),
  799. ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
  800. ('core', '0052_alter_snapshottag_unique_together_and_more'),
  801. ('core', '0053_remove_snapshottag_snapshot_old'),
  802. ('core', '0054_alter_snapshot_timestamp'),
  803. ('core', '0055_alter_tag_slug'),
  804. ('core', '0056_remove_tag_uuid'),
  805. ('core', '0057_rename_id_tag_old_id'),
  806. ('core', '0058_alter_tag_old_id'),
  807. ('core', '0059_tag_id'),
  808. ('core', '0060_alter_tag_id'),
  809. ('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
  810. ('core', '0062_alter_snapshottag_old_tag'),
  811. ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
  812. ('core', '0064_alter_snapshottag_unique_together_and_more'),
  813. ('core', '0065_remove_snapshottag_old_tag'),
  814. ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
  815. ('core', '0067_alter_snapshottag_tag'),
  816. ('core', '0068_alter_archiveresult_options'),
  817. ('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
  818. ('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
  819. ('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
  820. ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
  821. ('core', '0073_rename_created_archiveresult_created_at_and_more'),
  822. ('core', '0074_alter_snapshot_downloaded_at'),
  823. # For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
  824. # We already recorded 0023-0074 above, so Django will know the state
  825. # For 0.8.x: Record original machine migrations (before squashing)
  826. # DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
  827. ('machine', '0001_initial'),
  828. ('machine', '0002_alter_machine_stats_installedbinary'),
  829. ('machine', '0003_alter_installedbinary_options_and_more'),
  830. ('machine', '0004_alter_installedbinary_abspath_and_more'),
  831. # Then the new migrations after squashing
  832. ('machine', '0002_rename_custom_cmds_to_overrides'),
  833. ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
  834. ('machine', '0004_drop_dependency_table'),
  835. # Crawls must come before core.0024 because 0024_b depends on it
  836. ('crawls', '0001_initial'),
  837. # Core 0024 migrations chain (in dependency order)
  838. ('core', '0024_b_clear_config_fields'),
  839. ('core', '0024_c_disable_fk_checks'),
  840. ('core', '0024_d_fix_crawls_config'),
  841. ('core', '0024_snapshot_crawl'),
  842. ('core', '0024_f_add_snapshot_config'),
  843. ('core', '0025_allow_duplicate_urls_per_crawl'),
  844. # For 0.8.x: Record original api migration (before squashing)
  845. # DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
  846. ('api', '0001_initial'),
  847. ('api', '0002_alter_apitoken_options'),
  848. ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
  849. ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
  850. ('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
  851. ('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
  852. ('api', '0007_alter_apitoken_created_by'),
  853. ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
  854. ('api', '0009_rename_created_apitoken_created_at_and_more'),
  855. # Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
  856. # Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
  857. # Do NOT record 0026+ as they need to be tested during migration
  858. ]
  859. for app, name in migrations:
  860. cursor.execute("""
  861. INSERT INTO django_migrations (app, name, applied)
  862. VALUES (?, ?, datetime('now'))
  863. """, (app, name))
  864. conn.commit()
  865. conn.close()
  866. return created_data
  867. # =============================================================================
  868. # Helper Functions
  869. # =============================================================================
  870. def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = None) -> subprocess.CompletedProcess:
  871. """Run archivebox command in subprocess with given data directory."""
  872. base_env = os.environ.copy()
  873. base_env['DATA_DIR'] = str(data_dir)
  874. base_env['USE_COLOR'] = 'False'
  875. base_env['SHOW_PROGRESS'] = 'False'
  876. # Disable ALL extractors for faster tests (can be overridden by env parameter)
  877. base_env['SAVE_ARCHIVEDOTORG'] = 'False'
  878. base_env['SAVE_TITLE'] = 'False'
  879. base_env['SAVE_FAVICON'] = 'False'
  880. base_env['SAVE_WGET'] = 'False'
  881. base_env['SAVE_SINGLEFILE'] = 'False'
  882. base_env['SAVE_SCREENSHOT'] = 'False'
  883. base_env['SAVE_PDF'] = 'False'
  884. base_env['SAVE_DOM'] = 'False'
  885. base_env['SAVE_READABILITY'] = 'False'
  886. base_env['SAVE_MERCURY'] = 'False'
  887. base_env['SAVE_GIT'] = 'False'
  888. base_env['SAVE_YTDLP'] = 'False'
  889. base_env['SAVE_HEADERS'] = 'False'
  890. base_env['SAVE_HTMLTOTEXT'] = 'False'
  891. # Override with any custom env vars
  892. if env:
  893. base_env.update(env)
  894. cmd = [sys.executable, '-m', 'archivebox'] + args
  895. return subprocess.run(
  896. cmd,
  897. capture_output=True,
  898. text=True,
  899. env=base_env,
  900. cwd=str(data_dir),
  901. timeout=timeout,
  902. )
  903. def create_data_dir_structure(data_dir: Path):
  904. """Create the basic ArchiveBox data directory structure."""
  905. (data_dir / 'archive').mkdir(parents=True, exist_ok=True)
  906. (data_dir / 'sources').mkdir(parents=True, exist_ok=True)
  907. (data_dir / 'logs').mkdir(parents=True, exist_ok=True)
  908. def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
  909. """Verify the number of snapshots in the database."""
  910. conn = sqlite3.connect(str(db_path))
  911. cursor = conn.cursor()
  912. cursor.execute("SELECT COUNT(*) FROM core_snapshot")
  913. count = cursor.fetchone()[0]
  914. conn.close()
  915. if count == expected:
  916. return True, f"Snapshot count OK: {count}"
  917. return False, f"Snapshot count mismatch: expected {expected}, got {count}"
  918. def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
  919. """Verify the number of tags in the database (exact match)."""
  920. conn = sqlite3.connect(str(db_path))
  921. cursor = conn.cursor()
  922. cursor.execute("SELECT COUNT(*) FROM core_tag")
  923. count = cursor.fetchone()[0]
  924. conn.close()
  925. if count == expected:
  926. return True, f"Tag count OK: {count}"
  927. return False, f"Tag count mismatch: expected {expected}, got {count}"
  928. def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]:
  929. """Verify the number of archive results in the database."""
  930. conn = sqlite3.connect(str(db_path))
  931. cursor = conn.cursor()
  932. cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
  933. count = cursor.fetchone()[0]
  934. conn.close()
  935. if count == expected:
  936. return True, f"ArchiveResult count OK: {count}"
  937. return False, f"ArchiveResult count mismatch: expected {expected}, got {count}"
  938. def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool, str]:
  939. """Verify ALL expected URLs exist in snapshots."""
  940. conn = sqlite3.connect(str(db_path))
  941. cursor = conn.cursor()
  942. cursor.execute("SELECT url FROM core_snapshot")
  943. actual_urls = {row[0] for row in cursor.fetchall()}
  944. conn.close()
  945. missing = set(expected_urls) - actual_urls
  946. if not missing:
  947. return True, "All URLs preserved"
  948. return False, f"Missing URLs: {missing}"
  949. def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tuple[bool, str]:
  950. """Verify ALL snapshot titles are preserved."""
  951. conn = sqlite3.connect(str(db_path))
  952. cursor = conn.cursor()
  953. cursor.execute("SELECT url, title FROM core_snapshot")
  954. actual = {row[0]: row[1] for row in cursor.fetchall()}
  955. conn.close()
  956. mismatches = []
  957. for url, expected_title in expected_titles.items():
  958. if url not in actual:
  959. mismatches.append(f"{url}: missing from database")
  960. elif actual[url] != expected_title:
  961. mismatches.append(f"{url}: expected '{expected_title}', got '{actual[url]}'")
  962. if not mismatches:
  963. return True, "All titles preserved"
  964. return False, f"Title mismatches: {mismatches}"
  965. def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
  966. """Verify foreign key relationships are intact."""
  967. conn = sqlite3.connect(str(db_path))
  968. cursor = conn.cursor()
  969. # Check ArchiveResult -> Snapshot FK
  970. cursor.execute("""
  971. SELECT COUNT(*) FROM core_archiveresult ar
  972. WHERE NOT EXISTS (SELECT 1 FROM core_snapshot s WHERE s.id = ar.snapshot_id)
  973. """)
  974. orphaned_results = cursor.fetchone()[0]
  975. conn.close()
  976. if orphaned_results == 0:
  977. return True, "Foreign keys intact"
  978. return False, f"Found {orphaned_results} orphaned ArchiveResults"
  979. def verify_all_snapshots_in_output(output: str, snapshots: List[Dict]) -> Tuple[bool, str]:
  980. """Verify ALL snapshots appear in command output (not just one)."""
  981. missing = []
  982. for snapshot in snapshots:
  983. url_fragment = snapshot['url'][:30]
  984. title = snapshot.get('title', '')
  985. if url_fragment not in output and (not title or title not in output):
  986. missing.append(snapshot['url'])
  987. if not missing:
  988. return True, "All snapshots found in output"
  989. return False, f"Missing snapshots in output: {missing}"
  990. def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
  991. """Verify the number of crawls in the database."""
  992. conn = sqlite3.connect(str(db_path))
  993. cursor = conn.cursor()
  994. cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
  995. count = cursor.fetchone()[0]
  996. conn.close()
  997. if count == expected:
  998. return True, f"Crawl count OK: {count}"
  999. return False, f"Crawl count mismatch: expected {expected}, got {count}"
  1000. def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
  1001. """
  1002. Verify that ArchiveResults were properly migrated to Process records.
  1003. Checks:
  1004. 1. All ArchiveResults have process_id set
  1005. 2. Process count matches ArchiveResult count
  1006. 3. Binary records created for unique cmd_version values
  1007. 4. Status mapping is correct
  1008. """
  1009. conn = sqlite3.connect(str(db_path))
  1010. cursor = conn.cursor()
  1011. # Check all ArchiveResults have process_id
  1012. cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
  1013. null_count = cursor.fetchone()[0]
  1014. if null_count > 0:
  1015. conn.close()
  1016. return False, f"Found {null_count} ArchiveResults without process_id"
  1017. # Check Process count
  1018. cursor.execute("SELECT COUNT(*) FROM machine_process")
  1019. process_count = cursor.fetchone()[0]
  1020. if process_count != expected_archiveresult_count:
  1021. conn.close()
  1022. return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
  1023. # Check status mapping
  1024. cursor.execute("""
  1025. SELECT ar.status, p.status, p.exit_code
  1026. FROM core_archiveresult ar
  1027. JOIN machine_process p ON ar.process_id = p.id
  1028. """)
  1029. status_errors = []
  1030. for ar_status, p_status, p_exit_code in cursor.fetchall():
  1031. expected_p_status, expected_exit_code = {
  1032. 'queued': ('queued', None),
  1033. 'started': ('running', None),
  1034. 'backoff': ('queued', None),
  1035. 'succeeded': ('exited', 0),
  1036. 'failed': ('exited', 1),
  1037. 'skipped': ('exited', None),
  1038. }.get(ar_status, ('queued', None))
  1039. if p_status != expected_p_status:
  1040. status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
  1041. if p_exit_code != expected_exit_code:
  1042. status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
  1043. if status_errors:
  1044. conn.close()
  1045. return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
  1046. conn.close()
  1047. return True, f"Process migration verified: {process_count} Processes created"