html.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. __package__ = 'archivebox.index'
  2. from datetime import datetime
  3. from typing import List, Optional, Iterator, Mapping
  4. from pathlib import Path
  5. from django.utils.html import format_html, mark_safe
  6. from collections import defaultdict
  7. from .schema import Link
  8. from ..system import atomic_write
  9. from ..logging_util import printable_filesize
  10. from ..util import (
  11. enforce_types,
  12. ts_to_date,
  13. urlencode,
  14. htmlencode,
  15. urldecode,
  16. )
  17. from ..config import (
  18. OUTPUT_DIR,
  19. VERSION,
  20. GIT_SHA,
  21. FOOTER_INFO,
  22. HTML_INDEX_FILENAME,
  23. )
# Template names handed to render_django_template() by the functions below.
MAIN_INDEX_TEMPLATE = 'static_index.html'       # default template for the main index
MINIMAL_INDEX_TEMPLATE = 'minimal_index.html'   # main index variant used when with_headers is False
LINK_DETAILS_TEMPLATE = 'snapshot.html'         # per-link details page
# Placeholder title shown on the details page when a link has no title and is not archived yet.
TITLE_LOADING_MSG = 'Not yet archived...'
  28. ### Main Links Index
  29. @enforce_types
  30. def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
  31. """parse an archive index html file and return the list of urls"""
  32. index_path = Path(out_dir) / HTML_INDEX_FILENAME
  33. if index_path.exists():
  34. with open(index_path, 'r', encoding='utf-8') as f:
  35. for line in f:
  36. if 'class="link-url"' in line:
  37. yield line.split('"')[1]
  38. return ()
  39. @enforce_types
  40. def generate_index_from_links(links: List[Link], with_headers: bool):
  41. if with_headers:
  42. output = main_index_template(links)
  43. else:
  44. output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE)
  45. return output
  46. @enforce_types
  47. def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
  48. """render the template for the entire main index"""
  49. return render_django_template(template, {
  50. 'version': VERSION,
  51. 'git_sha': GIT_SHA,
  52. 'num_links': str(len(links)),
  53. 'date_updated': datetime.now().strftime('%Y-%m-%d'),
  54. 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
  55. 'links': [link._asdict(extended=True) for link in links],
  56. 'FOOTER_INFO': FOOTER_INFO,
  57. })
  58. ### Link Details Index
  59. @enforce_types
  60. def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
  61. out_dir = out_dir or link.link_dir
  62. rendered_html = link_details_template(link)
  63. atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
  64. @enforce_types
  65. def link_details_template(link: Link) -> str:
  66. from ..extractors.wget import wget_output_path
  67. link_info = link._asdict(extended=True)
  68. return render_django_template(LINK_DETAILS_TEMPLATE, {
  69. **link_info,
  70. **link_info['canonical'],
  71. 'title': htmlencode(
  72. link.title
  73. or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
  74. ),
  75. 'url_str': htmlencode(urldecode(link.base_url)),
  76. 'archive_url': urlencode(
  77. wget_output_path(link)
  78. or (link.domain if link.is_archived else '')
  79. ) or 'about:blank',
  80. 'extension': link.extension or 'html',
  81. 'tags': link.tags or 'untagged',
  82. 'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
  83. 'status': 'archived' if link.is_archived else 'not yet archived',
  84. 'status_color': 'success' if link.is_archived else 'danger',
  85. 'oldest_archive_date': ts_to_date(link.oldest_archive_date),
  86. })
  87. @enforce_types
  88. def render_django_template(template: str, context: Mapping[str, str]) -> str:
  89. """render a given html template string with the given template content"""
  90. from django.template.loader import render_to_string
  91. return render_to_string(template, context)
  92. def snapshot_icons(snapshot) -> str:
  93. from core.models import EXTRACTORS
  94. archive_results = snapshot.archiveresult_set.filter(status="succeeded")
  95. link = snapshot.as_link()
  96. path = link.archive_path
  97. canon = link.canonical_outputs()
  98. output = ""
  99. output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
  100. icons = {
  101. "singlefile": "❶",
  102. "wget": "🆆",
  103. "dom": "🅷",
  104. "pdf": "📄",
  105. "screenshot": "💻",
  106. "media": "📼",
  107. "git": "🅶",
  108. "archive_org": "🏛",
  109. "readability": "🆁",
  110. "mercury": "🅼",
  111. "warc": "📦"
  112. }
  113. exclude = ["favicon", "title", "headers", "archive_org"]
  114. # Missing specific entry for WARC
  115. extractor_items = defaultdict(lambda: None)
  116. for extractor, _ in EXTRACTORS:
  117. for result in archive_results:
  118. if result.extractor == extractor:
  119. extractor_items[extractor] = result
  120. for extractor, _ in EXTRACTORS:
  121. if extractor not in exclude:
  122. exists = False
  123. if extractor_items[extractor] is not None:
  124. outpath = (Path(path) / canon[f"{extractor}_path"])
  125. if outpath.is_dir():
  126. exists = any(outpath.glob('*.*'))
  127. elif outpath.is_file():
  128. exists = outpath.stat().st_size > 100
  129. output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists),
  130. extractor, icons.get(extractor, "?"))
  131. if extractor == "wget":
  132. # warc isn't technically it's own extractor, so we have to add it after wget
  133. exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
  134. output += format_html(output_template, exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
  135. if extractor == "archive_org":
  136. # The check for archive_org is different, so it has to be handled separately
  137. target_path = Path(path) / "archive.org.txt"
  138. exists = target_path.exists()
  139. output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
  140. "archive_org", icons.get("archive_org", "?"))
  141. return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))