# archivebox/index/html.py
  1. __package__ = 'archivebox.index'
  2. from datetime import datetime
  3. from typing import List, Optional, Iterator, Mapping
  4. from pathlib import Path
  5. from django.db.models import Model
  6. from django.utils.html import format_html, mark_safe
  7. from collections import defaultdict
  8. from .schema import Link
  9. from ..system import atomic_write
  10. from ..logging_util import printable_filesize
  11. from ..util import (
  12. enforce_types,
  13. ts_to_date,
  14. urlencode,
  15. htmlencode,
  16. urldecode,
  17. )
  18. from ..config import (
  19. OUTPUT_DIR,
  20. VERSION,
  21. GIT_SHA,
  22. FOOTER_INFO,
  23. HTML_INDEX_FILENAME,
  24. SAVE_ARCHIVE_DOT_ORG,
  25. )
  26. MAIN_INDEX_TEMPLATE = 'static_index.html'
  27. MINIMAL_INDEX_TEMPLATE = 'minimal_index.html'
  28. LINK_DETAILS_TEMPLATE = 'snapshot.html'
  29. TITLE_LOADING_MSG = 'Not yet archived...'
  30. ### Main Links Index
  31. @enforce_types
  32. def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
  33. """parse an archive index html file and return the list of urls"""
  34. index_path = Path(out_dir) / HTML_INDEX_FILENAME
  35. if index_path.exists():
  36. with open(index_path, 'r', encoding='utf-8') as f:
  37. for line in f:
  38. if 'class="link-url"' in line:
  39. yield line.split('"')[1]
  40. return ()
  41. @enforce_types
  42. def generate_index_from_snapshots(snapshots: List[Model], with_headers: bool):
  43. if with_headers:
  44. output = main_index_template(snapshots)
  45. else:
  46. output = main_index_template(snapshots, template=MINIMAL_INDEX_TEMPLATE)
  47. return output
  48. @enforce_types
  49. def main_index_template(snapshots: List[Model], template: str=MAIN_INDEX_TEMPLATE) -> str:
  50. """render the template for the entire main index"""
  51. return render_django_template(template, {
  52. 'version': VERSION,
  53. 'git_sha': GIT_SHA,
  54. 'num_snapshots': str(len(snapshots)),
  55. 'date_updated': datetime.now().strftime('%Y-%m-%d'),
  56. 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
  57. 'snapshots': snapshots,
  58. 'FOOTER_INFO': FOOTER_INFO,
  59. })
  60. ### Link Details Index
  61. @enforce_types
  62. def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
  63. out_dir = out_dir or snapshot.snapshot_dir
  64. rendered_html = snapshot_details_template(snapshot)
  65. atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
  66. @enforce_types
  67. def snapshot_details_template(snapshot: Model) -> str:
  68. from ..extractors.wget import wget_output_path
  69. return render_django_template(LINK_DETAILS_TEMPLATE, {
  70. **snapshot.as_json(),
  71. **snapshot.canonical_outputs(),
  72. 'title': htmlencode(
  73. snapshot.title
  74. or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
  75. ),
  76. 'url_str': htmlencode(urldecode(snapshot.base_url)),
  77. 'archive_url': urlencode(
  78. wget_output_path(snapshot)
  79. or (snapshot.domain if snapshot.is_archived else '')
  80. ) or 'about:blank',
  81. 'extension': snapshot.extension or 'html',
  82. 'tags': snapshot.tags_str() or 'untagged',
  83. 'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
  84. 'status': 'archived' if snapshot.is_archived else 'not yet archived',
  85. 'status_color': 'success' if snapshot.is_archived else 'danger',
  86. 'oldest_archive_date': ts_to_date(snapshot.oldest_archive_date),
  87. 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
  88. })
  89. @enforce_types
  90. def render_django_template(template: str, context: Mapping[str, str]) -> str:
  91. """render a given html template string with the given template content"""
  92. from django.template.loader import render_to_string
  93. return render_to_string(template, context)
  94. def snapshot_icons(snapshot) -> str:
  95. from core.models import EXTRACTORS
  96. # start = datetime.now()
  97. archive_results = snapshot.archiveresult_set.filter(status="succeeded")
  98. path = snapshot.archive_path
  99. canon = snapshot.canonical_outputs()
  100. output = ""
  101. output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
  102. icons = {
  103. "singlefile": "❶",
  104. "wget": "🆆",
  105. "dom": "🅷",
  106. "pdf": "📄",
  107. "screenshot": "💻",
  108. "media": "📼",
  109. "git": "🅶",
  110. "archive_org": "🏛",
  111. "readability": "🆁",
  112. "mercury": "🅼",
  113. "warc": "📦"
  114. }
  115. exclude = ["favicon", "title", "headers", "archive_org"]
  116. # Missing specific entry for WARC
  117. extractor_outputs = defaultdict(lambda: None)
  118. for extractor, _ in EXTRACTORS:
  119. for result in archive_results:
  120. if result.extractor == extractor and result:
  121. extractor_outputs[extractor] = result
  122. for extractor, _ in EXTRACTORS:
  123. if extractor not in exclude:
  124. existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
  125. # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
  126. # if existing:
  127. # existing = (Path(path) / existing)
  128. # if existing.is_file():
  129. # existing = True
  130. # elif existing.is_dir():
  131. # existing = any(existing.glob('*.*'))
  132. output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
  133. extractor, icons.get(extractor, "?"))
  134. if extractor == "wget":
  135. # warc isn't technically it's own extractor, so we have to add it after wget
  136. # get from db (faster but less thurthful)
  137. exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
  138. # get from filesystem (slower but more accurate)
  139. # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
  140. output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
  141. if extractor == "archive_org":
  142. # The check for archive_org is different, so it has to be handled separately
  143. # get from db (faster)
  144. exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
  145. # get from filesystem (slower)
  146. # target_path = Path(path) / "archive.org.txt"
  147. # exists = target_path.exists()
  148. output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
  149. "archive_org", icons.get("archive_org", "?"))
  150. result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
  151. # end = datetime.now()
  152. # print(((end - start).total_seconds()*1000) // 1, 'ms')
  153. return result