# 0007_archiveresult.py
# Generated by Django 3.0.8 on 2020-11-04 12:25
  2. import json
  3. from pathlib import Path
  4. from django.db import migrations, models
  5. import django.db.models.deletion
  6. from config import CONFIG
  7. from index.json import to_json
  8. try:
  9. JSONField = models.JSONField
  10. except AttributeError:
  11. import jsonfield
  12. JSONField = jsonfield.JSONField
  13. def forwards_func(apps, schema_editor):
  14. Snapshot = apps.get_model("core", "Snapshot")
  15. ArchiveResult = apps.get_model("core", "ArchiveResult")
  16. snapshots = Snapshot.objects.all()
  17. for snapshot in snapshots:
  18. out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
  19. try:
  20. with open(out_dir / "index.json", "r") as f:
  21. fs_index = json.load(f)
  22. except Exception as e:
  23. continue
  24. history = fs_index["history"]
  25. for extractor in history:
  26. for result in history[extractor]:
  27. try:
  28. ArchiveResult.objects.create(
  29. extractor=extractor,
  30. snapshot=snapshot,
  31. pwd=result["pwd"],
  32. cmd=result.get("cmd") or [],
  33. cmd_version=result.get("cmd_version") or 'unknown',
  34. start_ts=result["start_ts"],
  35. end_ts=result["end_ts"],
  36. status=result["status"],
  37. output=result.get("output") or 'null',
  38. )
  39. except Exception as e:
  40. print(
  41. ' ! Skipping import due to missing/invalid index.json:',
  42. out_dir,
  43. e,
  44. '(open an issue with this index.json for help)',
  45. )
  46. def verify_json_index_integrity(snapshot):
  47. results = snapshot.archiveresult_set.all()
  48. out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
  49. with open(out_dir / "index.json", "r") as f:
  50. index = json.load(f)
  51. history = index["history"]
  52. index_results = [result for extractor in history for result in history[extractor]]
  53. flattened_results = [result["start_ts"] for result in index_results]
  54. missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
  55. for missing in missing_results:
  56. index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
  57. "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
  58. "schema": "ArchiveResult", "status": missing.status})
  59. json_index = to_json(index)
  60. with open(out_dir / "index.json", "w") as f:
  61. f.write(json_index)
  62. def reverse_func(apps, schema_editor):
  63. Snapshot = apps.get_model("core", "Snapshot")
  64. ArchiveResult = apps.get_model("core", "ArchiveResult")
  65. for snapshot in Snapshot.objects.all():
  66. verify_json_index_integrity(snapshot)
  67. ArchiveResult.objects.all().delete()
  68. class Migration(migrations.Migration):
  69. dependencies = [
  70. ('core', '0006_auto_20201012_1520'),
  71. ]
  72. operations = [
  73. migrations.CreateModel(
  74. name='ArchiveResult',
  75. fields=[
  76. ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
  77. ('cmd', JSONField()),
  78. ('pwd', models.CharField(max_length=256)),
  79. ('cmd_version', models.CharField(max_length=32)),
  80. ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
  81. ('output', models.CharField(max_length=512)),
  82. ('start_ts', models.DateTimeField()),
  83. ('end_ts', models.DateTimeField()),
  84. ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
  85. ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
  86. ],
  87. ),
  88. migrations.RunPython(forwards_func, reverse_func),
  89. ]