#!/usr/bin/env python3
"""
Migration tests from 0.8.x to 0.9.x.
0.8.x introduced:
- Crawl model for grouping URLs
- Seed model (removed in 0.9.x)
- UUID primary keys for Snapshot
- Status fields for state machine
- New fields like depth, retry_at, etc.
"""
import json
import os
import shutil
import sqlite3
import subprocess
import tempfile
import unittest
from pathlib import Path

from .test_migrations_helpers import (
    SCHEMA_0_8,
    seed_0_8_data,
    run_archivebox,
    create_data_dir_structure,
    verify_snapshot_count,
    verify_snapshot_urls,
    verify_snapshot_titles,
    verify_tag_count,
    verify_archiveresult_count,
    verify_foreign_keys,
    verify_all_snapshots_in_output,
    verify_crawl_count,
    verify_process_migration,
)
  33. class TestMigrationFrom08x(unittest.TestCase):
  34. """Test migration from 0.8.x schema to latest."""
  35. def setUp(self):
  36. """Create a temporary directory with 0.8.x schema and data."""
  37. self.work_dir = Path(tempfile.mkdtemp())
  38. self.db_path = self.work_dir / 'index.sqlite3'
  39. # Create directory structure
  40. create_data_dir_structure(self.work_dir)
  41. # Create database with 0.8.x schema
  42. conn = sqlite3.connect(str(self.db_path))
  43. conn.executescript(SCHEMA_0_8)
  44. conn.close()
  45. # Seed with test data
  46. self.original_data = seed_0_8_data(self.db_path)
  47. def tearDown(self):
  48. """Clean up temporary directory."""
  49. shutil.rmtree(self.work_dir, ignore_errors=True)
  50. def test_migration_preserves_snapshot_count(self):
  51. """Migration should preserve all snapshots from 0.8.x."""
  52. expected_count = len(self.original_data['snapshots'])
  53. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  54. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  55. ok, msg = verify_snapshot_count(self.db_path, expected_count)
  56. self.assertTrue(ok, msg)
  57. def test_migration_preserves_snapshot_urls(self):
  58. """Migration should preserve all snapshot URLs from 0.8.x."""
  59. expected_urls = [s['url'] for s in self.original_data['snapshots']]
  60. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  61. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  62. ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
  63. self.assertTrue(ok, msg)
  64. def test_migration_preserves_crawls(self):
  65. """Migration should preserve all Crawl records and create default crawl if needed."""
  66. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  67. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  68. # Count snapshots with NULL crawl_id in original data
  69. snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
  70. # Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
  71. expected_count = len(self.original_data['crawls'])
  72. if snapshots_without_crawl > 0:
  73. expected_count += 1 # Migration 0024 creates a default crawl
  74. ok, msg = verify_crawl_count(self.db_path, expected_count)
  75. self.assertTrue(ok, msg)
  76. def test_migration_preserves_snapshot_crawl_links(self):
  77. """Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans."""
  78. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  79. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  80. conn = sqlite3.connect(str(self.db_path))
  81. cursor = conn.cursor()
  82. # Check EVERY snapshot has a crawl_id after migration
  83. for snapshot in self.original_data['snapshots']:
  84. cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],))
  85. row = cursor.fetchone()
  86. self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
  87. if snapshot['crawl_id'] is not None:
  88. # Snapshots that had a crawl should keep it
  89. self.assertEqual(row[0], snapshot['crawl_id'],
  90. f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
  91. else:
  92. # Snapshots without a crawl should now have one (the default crawl)
  93. self.assertIsNotNone(row[0],
  94. f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL")
  95. conn.close()
  96. def test_migration_preserves_tags(self):
  97. """Migration should preserve all tags."""
  98. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  99. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  100. ok, msg = verify_tag_count(self.db_path, len(self.original_data['tags']))
  101. self.assertTrue(ok, msg)
  102. def test_migration_preserves_archiveresults(self):
  103. """Migration should preserve all archive results."""
  104. expected_count = len(self.original_data['archiveresults'])
  105. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  106. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  107. ok, msg = verify_archiveresult_count(self.db_path, expected_count)
  108. self.assertTrue(ok, msg)
  109. def test_migration_preserves_archiveresult_status(self):
  110. """Migration should preserve archive result status values."""
  111. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  112. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  113. conn = sqlite3.connect(str(self.db_path))
  114. cursor = conn.cursor()
  115. # Get status counts
  116. cursor.execute("SELECT status, COUNT(*) FROM core_archiveresult GROUP BY status")
  117. status_counts = dict(cursor.fetchall())
  118. conn.close()
  119. # Original data has known status distribution: succeeded, failed, skipped
  120. self.assertIn('succeeded', status_counts, "Should have succeeded results")
  121. self.assertIn('failed', status_counts, "Should have failed results")
  122. self.assertIn('skipped', status_counts, "Should have skipped results")
  123. def test_status_works_after_migration(self):
  124. """Status command should work after migration."""
  125. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  126. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  127. result = run_archivebox(self.work_dir, ['status'])
  128. self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
  129. def test_list_works_after_migration(self):
  130. """List command should work and show ALL migrated data."""
  131. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  132. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  133. result = run_archivebox(self.work_dir, ['snapshot', 'list'])
  134. self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
  135. # Verify ALL snapshots appear in output
  136. output = result.stdout + result.stderr
  137. ok, msg = verify_all_snapshots_in_output(output, self.original_data['snapshots'])
  138. self.assertTrue(ok, msg)
  139. def test_search_works_after_migration(self):
  140. """Search command should find ALL migrated snapshots."""
  141. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  142. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  143. result = run_archivebox(self.work_dir, ['search'])
  144. self.assertEqual(result.returncode, 0, f"Search failed after migration: {result.stderr}")
  145. # Verify ALL snapshots appear in output
  146. output = result.stdout + result.stderr
  147. ok, msg = verify_all_snapshots_in_output(output, self.original_data['snapshots'])
  148. self.assertTrue(ok, msg)
  149. def test_migration_preserves_snapshot_titles(self):
  150. """Migration should preserve all snapshot titles."""
  151. expected_titles = {s['url']: s['title'] for s in self.original_data['snapshots']}
  152. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  153. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  154. ok, msg = verify_snapshot_titles(self.db_path, expected_titles)
  155. self.assertTrue(ok, msg)
  156. def test_migration_preserves_foreign_keys(self):
  157. """Migration should maintain foreign key relationships."""
  158. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  159. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  160. ok, msg = verify_foreign_keys(self.db_path)
  161. self.assertTrue(ok, msg)
  162. def test_migration_removes_seed_id_column(self):
  163. """Migration should remove seed_id column from archivebox.crawls.crawl."""
  164. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  165. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  166. conn = sqlite3.connect(str(self.db_path))
  167. cursor = conn.cursor()
  168. cursor.execute("PRAGMA table_info(crawls_crawl)")
  169. columns = [row[1] for row in cursor.fetchall()]
  170. conn.close()
  171. self.assertNotIn('seed_id', columns,
  172. f"seed_id column should have been removed by migration. Columns: {columns}")
  173. def test_migration_removes_seed_table(self):
  174. """Migration should remove crawls_seed table."""
  175. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  176. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  177. conn = sqlite3.connect(str(self.db_path))
  178. cursor = conn.cursor()
  179. cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_seed'")
  180. table_exists = cursor.fetchone() is not None
  181. conn.close()
  182. self.assertFalse(table_exists, "crawls_seed table should have been removed by migration")
  183. def test_add_works_after_migration(self):
  184. """Adding new URLs should work after migration from 0.8.x."""
  185. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  186. # Check that init actually ran and applied migrations
  187. self.assertIn('Applying', result.stdout + result.stderr,
  188. f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}")
  189. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  190. # Count existing crawls
  191. conn = sqlite3.connect(str(self.db_path))
  192. cursor = conn.cursor()
  193. cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
  194. initial_crawl_count = cursor.fetchone()[0]
  195. conn.close()
  196. # Try to add a new URL after migration (use --index-only for speed)
  197. result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
  198. self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
  199. # Verify a new Crawl was created
  200. conn = sqlite3.connect(str(self.db_path))
  201. cursor = conn.cursor()
  202. cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
  203. new_crawl_count = cursor.fetchone()[0]
  204. conn.close()
  205. self.assertGreater(new_crawl_count, initial_crawl_count,
  206. f"No new Crawl created when adding URL. Add stderr: {result.stderr[-500:]}")
  207. def test_version_works_after_migration(self):
  208. """Version command should work after migration."""
  209. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  210. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  211. result = run_archivebox(self.work_dir, ['version'])
  212. self.assertEqual(result.returncode, 0, f"Version failed after migration: {result.stderr}")
  213. # Should show version info
  214. output = result.stdout + result.stderr
  215. self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
  216. f"Version output missing expected content: {output[:500]}")
  217. def test_migration_creates_process_records(self):
  218. """Migration should create Process records for all ArchiveResults."""
  219. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  220. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  221. # Verify Process records created
  222. expected_count = len(self.original_data['archiveresults'])
  223. ok, msg = verify_process_migration(self.db_path, expected_count)
  224. self.assertTrue(ok, msg)
  225. def test_migration_creates_binary_records(self):
  226. """Migration should create Binary records from cmd_version data."""
  227. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  228. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  229. conn = sqlite3.connect(str(self.db_path))
  230. cursor = conn.cursor()
  231. # Check Binary records exist
  232. cursor.execute("SELECT COUNT(*) FROM machine_binary")
  233. binary_count = cursor.fetchone()[0]
  234. # Should have at least one binary per unique extractor
  235. extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
  236. self.assertGreaterEqual(binary_count, len(extractors),
  237. f"Expected at least {len(extractors)} Binaries, got {binary_count}")
  238. conn.close()
  239. def test_migration_preserves_cmd_data(self):
  240. """Migration should preserve cmd data in Process.cmd field."""
  241. result = run_archivebox(self.work_dir, ['init'], timeout=45)
  242. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  243. conn = sqlite3.connect(str(self.db_path))
  244. cursor = conn.cursor()
  245. # Check that Process records have cmd arrays
  246. cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
  247. cmd_records = cursor.fetchall()
  248. # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
  249. expected_count = len(self.original_data['archiveresults'])
  250. self.assertEqual(len(cmd_records), expected_count,
  251. f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
  252. conn.close()
  253. class TestMigrationDataIntegrity08x(unittest.TestCase):
  254. """Comprehensive data integrity tests for 0.8.x migrations."""
  255. def test_no_duplicate_snapshots_after_migration(self):
  256. """Migration should not create duplicate snapshots."""
  257. work_dir = Path(tempfile.mkdtemp())
  258. db_path = work_dir / 'index.sqlite3'
  259. try:
  260. create_data_dir_structure(work_dir)
  261. conn = sqlite3.connect(str(db_path))
  262. conn.executescript(SCHEMA_0_8)
  263. conn.close()
  264. seed_0_8_data(db_path)
  265. result = run_archivebox(work_dir, ['init'], timeout=45)
  266. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  267. # Check for duplicate URLs
  268. conn = sqlite3.connect(str(db_path))
  269. cursor = conn.cursor()
  270. cursor.execute("""
  271. SELECT url, COUNT(*) as cnt FROM core_snapshot
  272. GROUP BY url HAVING cnt > 1
  273. """)
  274. duplicates = cursor.fetchall()
  275. conn.close()
  276. self.assertEqual(len(duplicates), 0, f"Found duplicate URLs: {duplicates}")
  277. finally:
  278. shutil.rmtree(work_dir, ignore_errors=True)
  279. def test_no_orphaned_archiveresults_after_migration(self):
  280. """Migration should not leave orphaned ArchiveResults."""
  281. work_dir = Path(tempfile.mkdtemp())
  282. db_path = work_dir / 'index.sqlite3'
  283. try:
  284. create_data_dir_structure(work_dir)
  285. conn = sqlite3.connect(str(db_path))
  286. conn.executescript(SCHEMA_0_8)
  287. conn.close()
  288. seed_0_8_data(db_path)
  289. result = run_archivebox(work_dir, ['init'], timeout=45)
  290. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  291. ok, msg = verify_foreign_keys(db_path)
  292. self.assertTrue(ok, msg)
  293. finally:
  294. shutil.rmtree(work_dir, ignore_errors=True)
  295. def test_timestamps_preserved_after_migration(self):
  296. """Migration should preserve original timestamps."""
  297. work_dir = Path(tempfile.mkdtemp())
  298. db_path = work_dir / 'index.sqlite3'
  299. try:
  300. create_data_dir_structure(work_dir)
  301. conn = sqlite3.connect(str(db_path))
  302. conn.executescript(SCHEMA_0_8)
  303. conn.close()
  304. original_data = seed_0_8_data(db_path)
  305. original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}
  306. result = run_archivebox(work_dir, ['init'], timeout=45)
  307. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  308. conn = sqlite3.connect(str(db_path))
  309. cursor = conn.cursor()
  310. cursor.execute("SELECT url, timestamp FROM core_snapshot")
  311. migrated_timestamps = {row[0]: row[1] for row in cursor.fetchall()}
  312. conn.close()
  313. for url, original_ts in original_timestamps.items():
  314. self.assertEqual(
  315. migrated_timestamps.get(url), original_ts,
  316. f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}"
  317. )
  318. finally:
  319. shutil.rmtree(work_dir, ignore_errors=True)
  320. def test_crawl_data_preserved_after_migration(self):
  321. """Migration should preserve crawl metadata (urls, label, status)."""
  322. work_dir = Path(tempfile.mkdtemp())
  323. db_path = work_dir / 'index.sqlite3'
  324. try:
  325. create_data_dir_structure(work_dir)
  326. conn = sqlite3.connect(str(db_path))
  327. conn.executescript(SCHEMA_0_8)
  328. conn.close()
  329. original_data = seed_0_8_data(db_path)
  330. result = run_archivebox(work_dir, ['init'], timeout=45)
  331. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  332. conn = sqlite3.connect(str(db_path))
  333. cursor = conn.cursor()
  334. # Check each crawl's data is preserved
  335. for crawl in original_data['crawls']:
  336. cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl['id'],))
  337. row = cursor.fetchone()
  338. self.assertIsNotNone(row, f"Crawl {crawl['id']} not found after migration")
  339. self.assertEqual(row[0], crawl['urls'], f"URLs mismatch for crawl {crawl['id']}")
  340. self.assertEqual(row[1], crawl['label'], f"Label mismatch for crawl {crawl['id']}")
  341. conn.close()
  342. finally:
  343. shutil.rmtree(work_dir, ignore_errors=True)
  344. def test_tag_associations_preserved_after_migration(self):
  345. """Migration should preserve snapshot-tag associations."""
  346. work_dir = Path(tempfile.mkdtemp())
  347. db_path = work_dir / 'index.sqlite3'
  348. try:
  349. create_data_dir_structure(work_dir)
  350. conn = sqlite3.connect(str(db_path))
  351. conn.executescript(SCHEMA_0_8)
  352. conn.close()
  353. seed_0_8_data(db_path)
  354. # Count tag associations before migration
  355. conn = sqlite3.connect(str(db_path))
  356. cursor = conn.cursor()
  357. cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags")
  358. original_count = cursor.fetchone()[0]
  359. conn.close()
  360. result = run_archivebox(work_dir, ['init'], timeout=45)
  361. self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
  362. # Count tag associations after migration
  363. conn = sqlite3.connect(str(db_path))
  364. cursor = conn.cursor()
  365. cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags")
  366. migrated_count = cursor.fetchone()[0]
  367. conn.close()
  368. self.assertEqual(migrated_count, original_count,
  369. f"Tag associations changed: {original_count} -> {migrated_count}")
  370. finally:
  371. shutil.rmtree(work_dir, ignore_errors=True)
  372. class TestFilesystemMigration08to09(unittest.TestCase):
  373. """Test filesystem migration from 0.8.x flat structure to 0.9.x organized structure."""
  374. def setUp(self):
  375. """Create a temporary directory for testing."""
  376. self.work_dir = Path(tempfile.mkdtemp())
  377. self.db_path = self.work_dir / 'index.sqlite3'
  378. def tearDown(self):
  379. """Clean up temporary directory."""
  380. shutil.rmtree(self.work_dir, ignore_errors=True)
  381. def test_archiveresult_files_preserved_after_migration(self):
  382. """
  383. Test that ArchiveResult output files are reorganized into new structure.
  384. This test verifies that:
  385. 1. Migration preserves ArchiveResult data in Process/Binary records
  386. 2. Running `archivebox update` reorganizes files into new structure
  387. 3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
  388. 4. All files are moved (no data loss)
  389. 5. Old archive/timestamp/ directories are cleaned up
  390. """
  391. # Use the real 0.7.2 database which has actual ArchiveResults with files
  392. gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
  393. if not gold_db.exists():
  394. self.skipTest(f"Gold standard database not found at {gold_db}")
  395. # Copy gold database to test directory
  396. import shutil
  397. for item in gold_db.iterdir():
  398. if item.is_dir():
  399. shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
  400. else:
  401. shutil.copy2(item, self.work_dir / item.name)
  402. # Count archive directories and files BEFORE migration
  403. archive_dir = self.work_dir / 'archive'
  404. dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
  405. dirs_before_count = len([d for d in dirs_before if d.is_dir()])
  406. # Count total files in all archive directories
  407. files_before = []
  408. for d in dirs_before:
  409. if d.is_dir():
  410. files_before.extend([f for f in d.rglob('*') if f.is_file()])
  411. files_before_count = len(files_before)
  412. # Sample some specific files to check they're preserved
  413. sample_files = [
  414. 'favicon.ico',
  415. 'screenshot.png',
  416. 'singlefile.html',
  417. 'headers.json',
  418. ]
  419. sample_paths_before = {}
  420. for d in dirs_before:
  421. if d.is_dir():
  422. for sample_file in sample_files:
  423. matching = list(d.glob(sample_file))
  424. if matching:
  425. sample_paths_before[f"{d.name}/{sample_file}"] = matching[0]
  426. print(f"\n[*] Archive directories before migration: {dirs_before_count}")
  427. print(f"[*] Total files before migration: {files_before_count}")
  428. print(f"[*] Sample files found: {len(sample_paths_before)}")
  429. # Run init to trigger migration
  430. result = run_archivebox(self.work_dir, ['init'], timeout=60)
  431. self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
  432. # Count archive directories and files AFTER migration
  433. dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
  434. dirs_after_count = len([d for d in dirs_after if d.is_dir()])
  435. files_after = []
  436. for d in dirs_after:
  437. if d.is_dir():
  438. files_after.extend([f for f in d.rglob('*') if f.is_file()])
  439. files_after_count = len(files_after)
  440. # Verify sample files still exist
  441. sample_paths_after = {}
  442. for d in dirs_after:
  443. if d.is_dir():
  444. for sample_file in sample_files:
  445. matching = list(d.glob(sample_file))
  446. if matching:
  447. sample_paths_after[f"{d.name}/{sample_file}"] = matching[0]
  448. print(f"[*] Archive directories after migration: {dirs_after_count}")
  449. print(f"[*] Total files after migration: {files_after_count}")
  450. print(f"[*] Sample files found: {len(sample_paths_after)}")
  451. # Verify files still in old structure after migration (not moved yet)
  452. self.assertEqual(dirs_before_count, dirs_after_count,
  453. f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
  454. self.assertEqual(files_before_count, files_after_count,
  455. f"Files lost during migration: {files_before_count} -> {files_after_count}")
  456. # Run update to trigger filesystem reorganization
  457. print(f"\n[*] Running archivebox update to reorganize filesystem...")
  458. result = run_archivebox(self.work_dir, ['update'], timeout=120)
  459. self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
  460. # Check new filesystem structure
  461. # New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
  462. users_dir = self.work_dir / 'users'
  463. snapshots_base = None
  464. if users_dir.exists():
  465. # Find the snapshots directory
  466. for user_dir in users_dir.iterdir():
  467. if user_dir.is_dir():
  468. user_snapshots = user_dir / 'snapshots'
  469. if user_snapshots.exists():
  470. snapshots_base = user_snapshots
  471. break
  472. print(f"[*] New structure base: {snapshots_base}")
  473. # Count files in new structure
  474. # Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files...
  475. files_new_structure = []
  476. new_sample_files = {}
  477. if snapshots_base and snapshots_base.exists():
  478. for date_dir in snapshots_base.iterdir():
  479. if date_dir.is_dir():
  480. for domain_dir in date_dir.iterdir():
  481. if domain_dir.is_dir():
  482. for snap_dir in domain_dir.iterdir():
  483. if snap_dir.is_dir():
  484. # Files are directly in snap-uuid/ directory (no plugin subdirs)
  485. for f in snap_dir.rglob('*'):
  486. if f.is_file():
  487. files_new_structure.append(f)
  488. # Track sample files
  489. if f.name in sample_files:
  490. new_sample_files[f"{snap_dir.name}/{f.name}"] = f
  491. files_new_count = len(files_new_structure)
  492. print(f"[*] Files in new structure: {files_new_count}")
  493. print(f"[*] Sample files in new structure: {len(new_sample_files)}")
  494. # Check old structure (should be gone or empty)
  495. old_archive_dir = self.work_dir / 'archive'
  496. old_files_remaining = []
  497. unmigrated_dirs = []
  498. if old_archive_dir.exists():
  499. for d in old_archive_dir.glob('*'):
  500. # Only count REAL directories, not symlinks (symlinks are the migrated ones)
  501. if d.is_dir(follow_symlinks=False) and d.name.replace('.', '').isdigit():
  502. # This is a timestamp directory (old structure)
  503. files_in_dir = [f for f in d.rglob('*') if f.is_file()]
  504. if files_in_dir:
  505. unmigrated_dirs.append((d.name, len(files_in_dir)))
  506. old_files_remaining.extend(files_in_dir)
  507. old_files_count = len(old_files_remaining)
  508. print(f"[*] Files remaining in old structure: {old_files_count}")
  509. if unmigrated_dirs:
  510. print(f"[*] Unmigrated directories: {unmigrated_dirs}")
  511. # CRITICAL: Verify files were moved to new structure
  512. self.assertGreater(files_new_count, 0,
  513. "No files found in new structure after update")
  514. # CRITICAL: Verify old structure is cleaned up
  515. self.assertEqual(old_files_count, 0,
  516. f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
  517. # CRITICAL: Verify all files were moved (total count should match)
  518. total_after_update = files_new_count + old_files_count
  519. self.assertEqual(files_before_count, total_after_update,
  520. f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
  521. # CRITICAL: Verify sample files exist in new structure
  522. self.assertGreater(len(new_sample_files), 0,
  523. f"Sample files not found in new structure")
  524. # Verify new path format
  525. for path_key, file_path in new_sample_files.items():
  526. # Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
  527. path_parts = file_path.parts
  528. self.assertIn('snapshots', path_parts,
  529. f"New path should contain 'snapshots': {file_path}")
  530. self.assertIn('users', path_parts,
  531. f"New path should contain 'users': {file_path}")
  532. print(f" ✓ {path_key} → {file_path.relative_to(self.work_dir)}")
  533. # Verify Process and Binary records were created
  534. conn = sqlite3.connect(str(self.db_path))
  535. cursor = conn.cursor()
  536. cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
  537. archiveresult_count = cursor.fetchone()[0]
  538. cursor.execute("SELECT COUNT(*) FROM machine_process")
  539. process_count = cursor.fetchone()[0]
  540. cursor.execute("SELECT COUNT(*) FROM machine_binary")
  541. binary_count = cursor.fetchone()[0]
  542. cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL")
  543. linked_count = cursor.fetchone()[0]
  544. conn.close()
  545. print(f"[*] ArchiveResults: {archiveresult_count}")
  546. print(f"[*] Process records created: {process_count}")
  547. print(f"[*] Binary records created: {binary_count}")
  548. print(f"[*] ArchiveResults linked to Process: {linked_count}")
  549. # Verify data migration happened correctly
  550. # The 0.7.2 gold database has 44 ArchiveResults
  551. self.assertEqual(archiveresult_count, 44,
  552. f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
  553. # Each ArchiveResult should create one Process record
  554. self.assertEqual(process_count, 44,
  555. f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
  556. # The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
  557. self.assertEqual(binary_count, 7,
  558. f"Expected 7 unique Binary records, got {binary_count}")
  559. # ALL ArchiveResults should be linked to Process records
  560. self.assertEqual(linked_count, 44,
  561. f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
  562. if __name__ == '__main__':
  563. unittest.main()