0007_archiveresult.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. # Generated by Django 3.0.8 on 2020-11-04 12:25
  2. import json
  3. from pathlib import Path
  4. from django.db import migrations, models
  5. import django.db.models.deletion
  6. from config import CONFIG
  7. from index.json import to_json
  8. try:
  9. JSONField = models.JSONField
  10. except AttributeError:
  11. import jsonfield
  12. JSONField = jsonfield.JSONField
  13. def forwards_func(apps, schema_editor):
  14. from core.models import EXTRACTORS
  15. Snapshot = apps.get_model("core", "Snapshot")
  16. ArchiveResult = apps.get_model("core", "ArchiveResult")
  17. snapshots = Snapshot.objects.all()
  18. for snapshot in snapshots:
  19. out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
  20. try:
  21. with open(out_dir / "index.json", "r") as f:
  22. fs_index = json.load(f)
  23. except Exception as e:
  24. continue
  25. history = fs_index["history"]
  26. for extractor in history:
  27. for result in history[extractor]:
  28. try:
  29. ArchiveResult.objects.create(
  30. extractor=extractor,
  31. snapshot=snapshot,
  32. pwd=result["pwd"],
  33. cmd=result.get("cmd") or [],
  34. cmd_version=result.get("cmd_version") or 'unknown',
  35. start_ts=result["start_ts"],
  36. end_ts=result["end_ts"],
  37. status=result["status"],
  38. output=result.get("output") or 'null',
  39. )
  40. except Exception as e:
  41. print(
  42. ' ! Skipping import due to missing/invalid index.json:',
  43. out_dir,
  44. e,
  45. '(open an issue with this index.json for help)',
  46. )
  47. def verify_json_index_integrity(snapshot):
  48. results = snapshot.archiveresult_set.all()
  49. out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
  50. with open(out_dir / "index.json", "r") as f:
  51. index = json.load(f)
  52. history = index["history"]
  53. index_results = [result for extractor in history for result in history[extractor]]
  54. flattened_results = [result["start_ts"] for result in index_results]
  55. missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
  56. for missing in missing_results:
  57. index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
  58. "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
  59. "schema": "ArchiveResult", "status": missing.status})
  60. json_index = to_json(index)
  61. with open(out_dir / "index.json", "w") as f:
  62. f.write(json_index)
  63. def reverse_func(apps, schema_editor):
  64. Snapshot = apps.get_model("core", "Snapshot")
  65. ArchiveResult = apps.get_model("core", "ArchiveResult")
  66. for snapshot in Snapshot.objects.all():
  67. verify_json_index_integrity(snapshot)
  68. ArchiveResult.objects.all().delete()
  69. class Migration(migrations.Migration):
  70. dependencies = [
  71. ('core', '0006_auto_20201012_1520'),
  72. ]
  73. operations = [
  74. migrations.CreateModel(
  75. name='ArchiveResult',
  76. fields=[
  77. ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
  78. ('cmd', JSONField()),
  79. ('pwd', models.CharField(max_length=256)),
  80. ('cmd_version', models.CharField(max_length=32)),
  81. ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
  82. ('output', models.CharField(max_length=512)),
  83. ('start_ts', models.DateTimeField()),
  84. ('end_ts', models.DateTimeField()),
  85. ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
  86. ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
  87. ],
  88. ),
  89. migrations.RunPython(forwards_func, reverse_func),
  90. ]