index.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. import os
  2. import json
  3. from datetime import datetime
  4. from string import Template
  5. from typing import List, Tuple, Iterator, Optional, Mapping
  6. from .schema import Link, ArchiveResult
  7. from .config import (
  8. OUTPUT_DIR,
  9. TEMPLATES_DIR,
  10. VERSION,
  11. GIT_SHA,
  12. FOOTER_INFO,
  13. TIMEOUT,
  14. )
  15. from .util import (
  16. ts_to_date,
  17. merge_links,
  18. urlencode,
  19. htmlencode,
  20. urldecode,
  21. derived_link_info,
  22. wget_output_path,
  23. enforce_types,
  24. TimedProgress,
  25. copy_and_overwrite,
  26. atomic_write,
  27. )
  28. from .parse import parse_links
  29. from .links import validate_links
  30. from .logs import (
  31. log_indexing_process_started,
  32. log_indexing_started,
  33. log_indexing_finished,
  34. log_parsing_started,
  35. log_parsing_finished,
  36. )
# Placeholder shown as a link's title in the index until archiving has produced a real one.
TITLE_LOADING_MSG = 'Not yet archived...'


### Homepage index for all the links
  39. @enforce_types
  40. def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
  41. """create index.html file for a given list of links"""
  42. log_indexing_process_started()
  43. log_indexing_started(out_dir, 'index.json')
  44. timer = TimedProgress(TIMEOUT * 2, prefix=' ')
  45. write_json_links_index(links, out_dir=out_dir)
  46. timer.end()
  47. log_indexing_finished(out_dir, 'index.json')
  48. log_indexing_started(out_dir, 'index.html')
  49. timer = TimedProgress(TIMEOUT * 2, prefix=' ')
  50. write_html_links_index(links, out_dir=out_dir, finished=finished)
  51. timer.end()
  52. log_indexing_finished(out_dir, 'index.html')
  53. @enforce_types
  54. def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
  55. """parse and load existing index with any new links from import_path merged in"""
  56. existing_links: List[Link] = []
  57. if out_dir:
  58. existing_links = list(parse_json_links_index(out_dir))
  59. new_links: List[Link] = []
  60. if import_path:
  61. # parse and validate the import file
  62. log_parsing_started(import_path)
  63. raw_links, parser_name = parse_links(import_path)
  64. new_links = list(validate_links(raw_links))
  65. # merge existing links in out_dir and new links
  66. all_links = list(validate_links(existing_links + new_links))
  67. if import_path and parser_name:
  68. num_parsed = len(raw_links)
  69. num_new_links = len(all_links) - len(existing_links)
  70. log_parsing_finished(num_parsed, num_new_links, parser_name)
  71. return all_links, new_links
  72. @enforce_types
  73. def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
  74. """write the json link index to a given path"""
  75. assert isinstance(links, List), 'Links must be a list, not a generator.'
  76. assert isinstance(links[0].history, dict)
  77. assert isinstance(links[0].sources, list)
  78. if links[0].history.get('title'):
  79. assert isinstance(links[0].history['title'][0], ArchiveResult)
  80. if links[0].sources:
  81. assert isinstance(links[0].sources[0], str)
  82. path = os.path.join(out_dir, 'index.json')
  83. index_json = {
  84. 'info': 'ArchiveBox Index',
  85. 'source': 'https://github.com/pirate/ArchiveBox',
  86. 'docs': 'https://github.com/pirate/ArchiveBox/wiki',
  87. 'version': VERSION,
  88. 'num_links': len(links),
  89. 'updated': datetime.now(),
  90. 'links': links,
  91. }
  92. atomic_write(index_json, path)
  93. @enforce_types
  94. def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
  95. """parse a archive index json file and return the list of links"""
  96. index_path = os.path.join(out_dir, 'index.json')
  97. if os.path.exists(index_path):
  98. with open(index_path, 'r', encoding='utf-8') as f:
  99. links = json.load(f)['links']
  100. for link_json in links:
  101. yield Link.from_json(link_json)
  102. return ()
  103. @enforce_types
  104. def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
  105. """write the html link index to a given path"""
  106. copy_and_overwrite(
  107. os.path.join(TEMPLATES_DIR, 'static'),
  108. os.path.join(out_dir, 'static'),
  109. )
  110. atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))
  111. with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
  112. index_html = f.read()
  113. with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
  114. link_row_html = f.read()
  115. link_rows = []
  116. for link in links:
  117. template_row_vars: Mapping[str, str] = {
  118. **derived_link_info(link),
  119. 'title': (
  120. link.title
  121. or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
  122. ),
  123. 'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
  124. 'favicon_url': (
  125. os.path.join('archive', link.timestamp, 'favicon.ico')
  126. # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
  127. ),
  128. 'archive_url': urlencode(
  129. wget_output_path(link) or 'index.html'
  130. ),
  131. }
  132. link_rows.append(Template(link_row_html).substitute(**template_row_vars))
  133. template_vars: Mapping[str, str] = {
  134. 'num_links': str(len(links)),
  135. 'date_updated': datetime.now().strftime('%Y-%m-%d'),
  136. 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
  137. 'footer_info': FOOTER_INFO,
  138. 'version': VERSION,
  139. 'git_sha': GIT_SHA,
  140. 'rows': '\n'.join(link_rows),
  141. 'status': 'finished' if finished else 'running',
  142. }
  143. template_html = Template(index_html).substitute(**template_vars)
  144. atomic_write(template_html, os.path.join(out_dir, 'index.html'))
  145. @enforce_types
  146. def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
  147. """hack to in-place update one row's info in the generated index html"""
  148. title = link.title or link.latest_outputs()['title']
  149. successful = link.num_outputs
  150. # Patch JSON index
  151. json_file_links = parse_json_links_index(out_dir)
  152. patched_links = []
  153. for saved_link in json_file_links:
  154. if saved_link.url == link.url:
  155. patched_links.append(saved_link.overwrite(
  156. title=title,
  157. history=link.history,
  158. updated=link.updated,
  159. ))
  160. else:
  161. patched_links.append(saved_link)
  162. write_json_links_index(patched_links, out_dir=out_dir)
  163. # Patch HTML index
  164. html_path = os.path.join(out_dir, 'index.html')
  165. html = open(html_path, 'r').read().split('\n')
  166. for idx, line in enumerate(html):
  167. if title and ('<span data-title-for="{}"'.format(link.url) in line):
  168. html[idx] = '<span>{}</span>'.format(title)
  169. elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
  170. html[idx] = '<span>{}</span>'.format(successful)
  171. break
  172. atomic_write('\n'.join(html), html_path)
  173. ### Individual link index
  174. @enforce_types
  175. def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
  176. link_dir = link_dir or link.link_dir
  177. write_json_link_index(link, link_dir)
  178. write_html_link_index(link, link_dir)
  179. @enforce_types
  180. def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
  181. """write a json file with some info about the link"""
  182. link_dir = link_dir or link.link_dir
  183. path = os.path.join(link_dir, 'index.json')
  184. atomic_write(link._asdict(), path)
  185. @enforce_types
  186. def parse_json_link_index(link_dir: str) -> Optional[Link]:
  187. """load the json link index from a given directory"""
  188. existing_index = os.path.join(link_dir, 'index.json')
  189. if os.path.exists(existing_index):
  190. with open(existing_index, 'r', encoding='utf-8') as f:
  191. link_json = json.load(f)
  192. return Link.from_json(link_json)
  193. return None
  194. @enforce_types
  195. def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
  196. """check for an existing link archive in the given directory,
  197. and load+merge it into the given link dict
  198. """
  199. link_dir = link_dir or link.link_dir
  200. existing_link = parse_json_link_index(link_dir)
  201. if existing_link:
  202. return merge_links(existing_link, link)
  203. return link
  204. @enforce_types
  205. def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
  206. link_dir = link_dir or link.link_dir
  207. with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
  208. link_html = f.read()
  209. path = os.path.join(link_dir, 'index.html')
  210. template_vars: Mapping[str, str] = {
  211. **derived_link_info(link),
  212. 'title': (
  213. link.title
  214. or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
  215. ),
  216. 'url_str': htmlencode(urldecode(link.base_url)),
  217. 'archive_url': urlencode(
  218. wget_output_path(link)
  219. or (link.domain if link.is_archived else 'about:blank')
  220. ),
  221. 'extension': link.extension or 'html',
  222. 'tags': link.tags or 'untagged',
  223. 'status': 'archived' if link.is_archived else 'not yet archived',
  224. 'status_color': 'success' if link.is_archived else 'danger',
  225. 'oldest_archive_date': ts_to_date(link.oldest_archive_date),
  226. }
  227. html_index = Template(link_html).substitute(**template_vars)
  228. atomic_write(html_index, path)