utils.py 4.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. from django.utils.html import format_html
  2. from core.models import Snapshot, EXTRACTORS
  3. from pathlib import Path
  4. def get_icons(snapshot: Snapshot) -> str:
  5. archive_results = snapshot.archiveresult_set
  6. link = snapshot.as_link()
  7. canon = link.canonical_outputs()
  8. output = ""
  9. output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
  10. icons = {
  11. "singlefile": "❶",
  12. "wget": "🆆",
  13. "dom": "🅷",
  14. "pdf": "📄",
  15. "screenshot": "💻",
  16. "media": "📼",
  17. "git": "🅶",
  18. "archive_org": "🏛",
  19. "readability": "🆁",
  20. "mercury": "🅼",
  21. "warc": "📦"
  22. }
  23. exclude = ["favicon", "archive_org"]
  24. # Missing specific entry for WARC
  25. for extractor, _ in EXTRACTORS:
  26. result = archive_results.filter(extractor=extractor, status="succeeded")
  27. path, exists = link.archive_path, result.exists()
  28. try:
  29. if extractor not in exclude:
  30. output += output_template.format(path, canon[f"{extractor}_path"],
  31. exists, extractor, icons.get(extractor, "?"))
  32. if extractor == "wget":
  33. # warc isn't technically it's own extractor, so we have to add it after wget
  34. exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
  35. if exists:
  36. output += output_template.format(exists[0], "",
  37. True, "warc", icons.get("warc", "?"))
  38. if extractor == "archive_org" and exists:
  39. # The check for archive_org is different, so it has to be handled separately
  40. target_path = Path(path) / "archive.org.txt"
  41. exists = target_path.exists()
  42. if exists:
  43. output += '<a href="{}" class="exists-{}" title="{}">{} </a>'.format(canon["archive_org_path"],
  44. True, "archive_org", icons.get("archive_org", "?"))
  45. except Exception as e:
  46. print(e)
  47. return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')
  48. #def get_icons(snapshot: Snapshot) -> str:
  49. # link = snapshot.as_link()
  50. # canon = link.canonical_outputs()
  51. # out_dir = Path(link.link_dir)
  52. #
  53. # # slow version: highlights icons based on whether files exist or not for that output
  54. # # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
  55. # # fast version: all icons are highlighted without checking for outputs in filesystem
  56. # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
  57. #
  58. # return format_html(
  59. # '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
  60. # '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
  61. # '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
  62. # '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
  63. # '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
  64. # '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
  65. # '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
  66. # '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
  67. # '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
  68. # '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
  69. # '</span>',
  70. # *link_tuple(link, 'singlefile_path'),
  71. # *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
  72. # *link_tuple(link, 'pdf_path'),
  73. # *link_tuple(link, 'screenshot_path'),
  74. # *link_tuple(link, 'dom_path'),
  75. # *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
  76. # *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
  77. # *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
  78. # canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
  79. # )
  80. #