0007_archiveresult.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # Generated by Django 3.0.8 on 2020-11-04 12:25
  2. import os
  3. import json
  4. from pathlib import Path
  5. from django.db import migrations, models
  6. import django.db.models.deletion
  7. from index.json import to_json
  8. DATA_DIR = Path(os.getcwd()).resolve() # archivebox user data dir
  9. ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir
  10. try:
  11. JSONField = models.JSONField
  12. except AttributeError:
  13. import jsonfield
  14. JSONField = jsonfield.JSONField
  15. def forwards_func(apps, schema_editor):
  16. Snapshot = apps.get_model("core", "Snapshot")
  17. ArchiveResult = apps.get_model("core", "ArchiveResult")
  18. snapshots = Snapshot.objects.all()
  19. for snapshot in snapshots:
  20. out_dir = ARCHIVE_DIR / snapshot.timestamp
  21. try:
  22. with open(out_dir / "index.json", "r") as f:
  23. fs_index = json.load(f)
  24. except Exception as e:
  25. continue
  26. history = fs_index["history"]
  27. for extractor in history:
  28. for result in history[extractor]:
  29. try:
  30. ArchiveResult.objects.create(
  31. extractor=extractor,
  32. snapshot=snapshot,
  33. pwd=result["pwd"],
  34. cmd=result.get("cmd") or [],
  35. cmd_version=result.get("cmd_version") or 'unknown',
  36. start_ts=result["start_ts"],
  37. end_ts=result["end_ts"],
  38. status=result["status"],
  39. output=result.get("output") or 'null',
  40. )
  41. except Exception as e:
  42. print(
  43. ' ! Skipping import due to missing/invalid index.json:',
  44. out_dir,
  45. e,
  46. '(open an issue with this index.json for help)',
  47. )
  48. def verify_json_index_integrity(snapshot):
  49. results = snapshot.archiveresult_set.all()
  50. out_dir = ARCHIVE_DIR / snapshot.timestamp
  51. with open(out_dir / "index.json", "r") as f:
  52. index = json.load(f)
  53. history = index["history"]
  54. index_results = [result for extractor in history for result in history[extractor]]
  55. flattened_results = [result["start_ts"] for result in index_results]
  56. missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
  57. for missing in missing_results:
  58. index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
  59. "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
  60. "schema": "ArchiveResult", "status": missing.status})
  61. json_index = to_json(index)
  62. with open(out_dir / "index.json", "w") as f:
  63. f.write(json_index)
  64. def reverse_func(apps, schema_editor):
  65. Snapshot = apps.get_model("core", "Snapshot")
  66. ArchiveResult = apps.get_model("core", "ArchiveResult")
  67. for snapshot in Snapshot.objects.all():
  68. verify_json_index_integrity(snapshot)
  69. ArchiveResult.objects.all().delete()
  70. class Migration(migrations.Migration):
  71. dependencies = [
  72. ('core', '0006_auto_20201012_1520'),
  73. ]
  74. operations = [
  75. migrations.CreateModel(
  76. name='ArchiveResult',
  77. fields=[
  78. ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
  79. ('cmd', JSONField()),
  80. ('pwd', models.CharField(max_length=256)),
  81. ('cmd_version', models.CharField(max_length=32)),
  82. ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
  83. ('output', models.CharField(max_length=512)),
  84. ('start_ts', models.DateTimeField()),
  85. ('end_ts', models.DateTimeField()),
  86. ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
  87. ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
  88. ],
  89. ),
  90. migrations.RunPython(forwards_func, reverse_func),
  91. ]