0007_archiveresult.py 3.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. # Generated by Django 3.0.8 on 2020-11-04 12:25
  2. import json
  3. from pathlib import Path
  4. from django.db import migrations, models
  5. import django.db.models.deletion
  6. from config import CONFIG
  7. from index.json import to_json
  8. try:
  9. JSONField = models.JSONField
  10. except AttributeError:
  11. import jsonfield
  12. JSONField = jsonfield.JSONField
  13. def forwards_func(apps, schema_editor):
  14. from core.models import EXTRACTORS
  15. Snapshot = apps.get_model("core", "Snapshot")
  16. ArchiveResult = apps.get_model("core", "ArchiveResult")
  17. snapshots = Snapshot.objects.all()
  18. for snapshot in snapshots:
  19. out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
  20. try:
  21. with open(out_dir / "index.json", "r") as f:
  22. fs_index = json.load(f)
  23. except Exception as e:
  24. continue
  25. history = fs_index["history"]
  26. for extractor in history:
  27. for result in history[extractor]:
  28. ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"] or 'unknown',
  29. start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
  30. def verify_json_index_integrity(snapshot):
  31. results = snapshot.archiveresult_set.all()
  32. out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
  33. with open(out_dir / "index.json", "r") as f:
  34. index = json.load(f)
  35. history = index["history"]
  36. index_results = [result for extractor in history for result in history[extractor]]
  37. flattened_results = [result["start_ts"] for result in index_results]
  38. missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
  39. for missing in missing_results:
  40. index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
  41. "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
  42. "schema": "ArchiveResult", "status": missing.status})
  43. json_index = to_json(index)
  44. with open(out_dir / "index.json", "w") as f:
  45. f.write(json_index)
  46. def reverse_func(apps, schema_editor):
  47. Snapshot = apps.get_model("core", "Snapshot")
  48. ArchiveResult = apps.get_model("core", "ArchiveResult")
  49. for snapshot in Snapshot.objects.all():
  50. verify_json_index_integrity(snapshot)
  51. ArchiveResult.objects.all().delete()
  52. class Migration(migrations.Migration):
  53. dependencies = [
  54. ('core', '0006_auto_20201012_1520'),
  55. ]
  56. operations = [
  57. migrations.CreateModel(
  58. name='ArchiveResult',
  59. fields=[
  60. ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
  61. ('cmd', JSONField()),
  62. ('pwd', models.CharField(max_length=256)),
  63. ('cmd_version', models.CharField(max_length=32)),
  64. ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
  65. ('output', models.CharField(max_length=512)),
  66. ('start_ts', models.DateTimeField()),
  67. ('end_ts', models.DateTimeField()),
  68. ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
  69. ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
  70. ],
  71. ),
  72. migrations.RunPython(forwards_func, reverse_func),
  73. ]