| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- # Generated by Django 3.0.8 on 2020-11-04 12:25
- import os
- import json
- from pathlib import Path
- from django.db import migrations, models
- import django.db.models.deletion
- from index.json import to_json
# ArchiveBox user data dir: the current working directory at import time.
DATA_DIR = Path.cwd().resolve()
# ArchiveBox snapshot data dir: the 'archive' folder inside the data dir.
ARCHIVE_DIR = DATA_DIR.joinpath('archive')
# Prefer the JSONField exposed by django.db.models when this Django version
# provides one; otherwise fall back to the third-party `jsonfield` package.
JSONField = getattr(models, 'JSONField', None)
if JSONField is None:
    import jsonfield
    JSONField = jsonfield.JSONField
def forwards_func(apps, schema_editor):
    """Backfill ArchiveResult rows from each snapshot's on-disk index.json.

    For every Snapshot, reads ``ARCHIVE_DIR/<snapshot.timestamp>/index.json``
    and creates one ArchiveResult per entry found under its ``"history"``
    mapping (extractor name -> list of result dicts).

    The import is best-effort and never aborts the migration because of bad
    on-disk data: snapshots whose index is missing, unreadable, or does not
    have the expected shape are skipped silently, and individual results that
    fail to import are reported on stdout and skipped.

    Args:
        apps: historical app registry passed in by ``migrations.RunPython``.
        schema_editor: unused; required by the RunPython callable signature.
    """
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")

    for snapshot in Snapshot.objects.all():
        out_dir = ARCHIVE_DIR / snapshot.timestamp

        try:
            with open(out_dir / "index.json", "r") as f:
                fs_index = json.load(f)
        except Exception:
            # Deliberately broad: a missing or corrupt index just means there
            # is nothing to import for this snapshot.
            continue

        # A parseable index can still be the wrong shape (not an object, or
        # no "history" key); treat that like a missing index instead of
        # letting a KeyError/TypeError abort the whole migration.
        history = fs_index.get("history") if isinstance(fs_index, dict) else None
        if not isinstance(history, dict):
            continue

        for extractor, results in history.items():
            if not isinstance(results, list):
                # Malformed history entry: nothing iterable to import.
                continue
            for result in results:
                try:
                    ArchiveResult.objects.create(
                        extractor=extractor,
                        snapshot=snapshot,
                        pwd=result["pwd"],
                        cmd=result.get("cmd") or [],
                        cmd_version=result.get("cmd_version") or 'unknown',
                        start_ts=result["start_ts"],
                        end_ts=result["end_ts"],
                        status=result["status"],
                        output=result.get("output") or 'null',
                    )
                except Exception as e:
                    # Covers both malformed result dicts (missing keys,
                    # wrong types) and database-level rejections.
                    print(
                        ' ! Skipping import due to missing/invalid index.json:',
                        out_dir,
                        e,
                        '(open an issue with this index.json for help)',
                    )
def verify_json_index_integrity(snapshot):
    """Copy ArchiveResult rows that index.json lacks back into that file.

    Loads ``ARCHIVE_DIR/<snapshot.timestamp>/index.json``, finds every
    ArchiveResult of ``snapshot`` whose ``start_ts`` (ISO format) does not
    appear among the index's history entries, appends those results to the
    matching extractor's history list, and rewrites the file via ``to_json``.
    The file is rewritten even when nothing was missing.

    Args:
        snapshot: object exposing ``timestamp`` and ``archiveresult_set``.

    Raises:
        OSError: if index.json cannot be opened for reading or writing.
        ValueError: if index.json does not contain valid JSON.
    """
    results = snapshot.archiveresult_set.all()
    index_path = ARCHIVE_DIR / snapshot.timestamp / "index.json"
    with open(index_path, "r") as f:
        index = json.load(f)

    # Tolerate an index without any history yet instead of raising KeyError.
    history = index.setdefault("history", {})

    # A set keeps the membership test below O(1) per database row.
    # NOTE(review): matching relies on the stored start_ts strings being
    # formatted exactly like datetime.isoformat() - confirm against the writer.
    known_start_ts = {
        entry.get("start_ts")
        for entries in history.values()
        for entry in entries
    }

    for missing in results:
        if missing.start_ts.isoformat() in known_start_ts:
            continue
        # setdefault: the index may have no bucket for this extractor yet.
        history.setdefault(missing.extractor, []).append({
            "cmd": missing.cmd,
            "cmd_version": missing.cmd_version,
            "end_ts": missing.end_ts.isoformat(),
            "start_ts": missing.start_ts.isoformat(),
            "pwd": missing.pwd,
            "output": missing.output,
            "schema": "ArchiveResult",
            "status": missing.status,
        })

    # Serialize before opening for write so a serialization failure cannot
    # leave a truncated index.json behind.
    json_index = to_json(index)
    with open(index_path, "w") as f:
        f.write(json_index)
def reverse_func(apps, schema_editor):
    """Undo the backfill: sync results to disk, then drop every ArchiveResult.

    Each snapshot is first passed to ``verify_json_index_integrity`` so its
    index.json is brought up to date; only after all snapshots have been
    processed are the ArchiveResult rows deleted.

    Args:
        apps: historical app registry passed in by ``migrations.RunPython``.
        schema_editor: unused; required by the RunPython callable signature.
    """
    snapshot_model = apps.get_model("core", "Snapshot")
    result_model = apps.get_model("core", "ArchiveResult")

    every_snapshot = snapshot_model.objects.all()
    for snap in every_snapshot:
        verify_json_index_integrity(snap)

    # Runs after the loop: every index has been synced before rows vanish.
    result_model.objects.all().delete()
class Migration(migrations.Migration):
    """Create the ArchiveResult model, then run the forwards/reverse data step."""

    dependencies = [
        ('core', '0006_auto_20201012_1520'),
    ]

    operations = [
        # Schema step: one row per extractor run, linked to its snapshot.
        migrations.CreateModel(
            name='ArchiveResult',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('cmd', JSONField()),
                ('pwd', models.CharField(max_length=256)),
                ('cmd_version', models.CharField(max_length=32)),
                (
                    'status',
                    models.CharField(
                        # Each choice is a (value, label) pair with value == label.
                        choices=[
                            (name, name)
                            for name in ('succeeded', 'failed', 'skipped')
                        ],
                        max_length=16,
                    ),
                ),
                ('output', models.CharField(max_length=512)),
                ('start_ts', models.DateTimeField()),
                ('end_ts', models.DateTimeField()),
                (
                    'extractor',
                    models.CharField(
                        choices=[
                            (name, name)
                            for name in (
                                'title',
                                'favicon',
                                'wget',
                                'singlefile',
                                'pdf',
                                'screenshot',
                                'dom',
                                'readability',
                                'mercury',
                                'git',
                                'media',
                                'headers',
                                'archive_org',
                            )
                        ],
                        max_length=32,
                    ),
                ),
                (
                    'snapshot',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='core.Snapshot',
                    ),
                ),
            ],
        ),
        # Data step: forwards_func on migrate, reverse_func on rollback.
        migrations.RunPython(forwards_func, reverse_func),
    ]
|