index.py

import os
import json

from datetime import datetime
from string import Template
from typing import List, Tuple, Iterator, Optional

from .schema import Link, ArchiveResult
from .config import (
    OUTPUT_DIR,
    TEMPLATES_DIR,
    VERSION,
    GIT_SHA,
    FOOTER_INFO,
    TIMEOUT,
)
from .util import (
    merge_links,
    urlencode,
    derived_link_info,
    wget_output_path,
    enforce_types,
    TimedProgress,
    copy_and_overwrite,
    atomic_write,
)
from .parse import parse_links
from .links import validate_links
from .logs import (
    log_indexing_process_started,
    log_indexing_started,
    log_indexing_finished,
    log_parsing_started,
    log_parsing_finished,
)

TITLE_LOADING_MSG = 'Not yet archived...'


### Homepage index for all the links
@enforce_types
def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """create index.html file for a given list of links"""

    log_indexing_process_started()

    log_indexing_started(out_dir, 'index.json')
    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
    write_json_links_index(links, out_dir=out_dir)
    timer.end()
    log_indexing_finished(out_dir, 'index.json')

    log_indexing_started(out_dir, 'index.html')
    timer = TimedProgress(TIMEOUT * 2, prefix=' ')
    write_html_links_index(links, out_dir=out_dir, finished=finished)
    timer.end()
    log_indexing_finished(out_dir, 'index.html')
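
# Usage sketch (hedged: assumes `links` is a list of Link objects, e.g. as
# returned by load_links_index below; finished=True stamps the page as complete):
#
#   write_links_index(links, out_dir=OUTPUT_DIR, finished=True)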
@enforce_types
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
    """parse and load existing index with any new links from import_path merged in"""

    existing_links: List[Link] = []
    if out_dir:
        existing_links = list(parse_json_links_index(out_dir))

    new_links: List[Link] = []
    if import_path:
        # parse and validate the import file
        log_parsing_started(import_path)
        raw_links, parser_name = parse_links(import_path)
        new_links = list(validate_links(raw_links))

    # merge existing links in out_dir and new links
    all_links = list(validate_links(existing_links + new_links))

    if import_path and parser_name:
        num_parsed = len(raw_links)
        num_new_links = len(all_links) - len(existing_links)
        log_parsing_finished(num_parsed, num_new_links, parser_name)

    return all_links, new_links
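
# Usage sketch ('bookmarks.html' is a hypothetical import file in any format
# parse_links understands):
#
#   all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path='bookmarks.html')
#   # all_links: validated union of the existing index and the imported links
#   # new_links: only the links parsed out of the import file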
@enforce_types
def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """write the json link index to a given path"""

    assert isinstance(links, list), 'Links must be a list, not a generator.'
    # spot-check the first link's shape before serializing the whole list
    if links:
        assert isinstance(links[0].history, dict)
        assert isinstance(links[0].sources, list)

        if links[0].history.get('title'):
            assert isinstance(links[0].history['title'][0], ArchiveResult)

        if links[0].sources:
            assert isinstance(links[0].sources[0], str)

    path = os.path.join(out_dir, 'index.json')

    index_json = {
        'info': 'ArchiveBox Index',
        'source': 'https://github.com/pirate/ArchiveBox',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'version': VERSION,
        'num_links': len(links),
        'updated': datetime.now(),
        'links': links,
    }
    atomic_write(index_json, path)
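
# The resulting OUTPUT_DIR/index.json looks roughly like this (values are
# illustrative; serializing datetimes and Link objects is assumed to be handled
# inside atomic_write):
#
#   {
#       "info": "ArchiveBox Index",
#       "source": "https://github.com/pirate/ArchiveBox",
#       "docs": "https://github.com/pirate/ArchiveBox/wiki",
#       "version": "0.3.0",
#       "num_links": 2,
#       "updated": "2019-04-12 18:30:05",
#       "links": [...]
#   }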
@enforce_types
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and yield the links it contains"""

    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            for link_json in links:
                yield Link.from_json(link_json)
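
# Usage sketch: this is a generator, so materialize it when a list is needed:
#
#   links = list(parse_json_links_index(OUTPUT_DIR))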
@enforce_types
def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """write the html link index to a given path"""

    path = os.path.join(out_dir, 'index.html')

    copy_and_overwrite(
        os.path.join(TEMPLATES_DIR, 'static'),
        os.path.join(out_dir, 'static'),
    )

    atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
        link_row_html = f.read()

    link_rows = '\n'.join(
        Template(link_row_html).substitute(**{
            **derived_link_info(link),
            'title': (
                link.title
                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
            ),
            'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
            'favicon_url': (
                os.path.join('archive', link.timestamp, 'favicon.ico')
                # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
            ),
            'archive_url': urlencode(
                wget_output_path(link) or 'index.html'
            ),
        })
        for link in links
    )

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'version': VERSION,
        'git_sha': GIT_SHA,
        'rows': link_rows,
        'status': 'finished' if finished else 'running',
    }

    atomic_write(Template(index_html).substitute(**template_vars), path)
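
# Template.substitute() fills stdlib string.Template $-style placeholders, so a
# minimal index.html template (hypothetical; the real one lives in TEMPLATES_DIR)
# might contain:
#
#   <title>Archive ($num_links links, updated $time_updated)</title>
#   <table>$rows</table>
#   <footer>$footer_info · version $version ($git_sha) · $status</footer>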
@enforce_types
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
    """hack to in-place update one row's info in the generated index html"""

    title = link.title or link.latest_outputs()['title']
    successful = link.num_outputs

    # Patch JSON index
    json_file_links = parse_json_links_index(out_dir)
    patched_links = []
    for saved_link in json_file_links:
        if saved_link.url == link.url:
            patched_links.append(saved_link.overwrite(
                title=title,
                history=link.history,
                updated=link.updated,
            ))
        else:
            patched_links.append(saved_link)

    write_json_links_index(patched_links, out_dir=out_dir)

    # Patch HTML index by rewriting the <span data-title-for=...> and
    # <span data-number-for=...> markers emitted for this link's row
    html_path = os.path.join(out_dir, 'index.html')
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read().split('\n')

    for idx, line in enumerate(html):
        if title and ('<span data-title-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(title)
        elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(successful)
            break

    atomic_write('\n'.join(html), html_path)
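
# Usage sketch (typically called right after archiving finishes for one link, so
# the homepage row reflects the new title/output count without a full reindex):
#
#   patch_links_index(link, out_dir=OUTPUT_DIR)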
### Individual link index

@enforce_types
def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
    link_dir = link_dir or link.link_dir

    write_json_link_index(link, link_dir)
    write_html_link_index(link, link_dir)
@enforce_types
def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
    """write a json file with some info about the link"""

    link_dir = link_dir or link.link_dir
    path = os.path.join(link_dir, 'index.json')

    atomic_write(link._asdict(), path)
@enforce_types
def parse_json_link_index(link_dir: str) -> Optional[Link]:
    """load the json link index from a given directory"""

    existing_index = os.path.join(link_dir, 'index.json')
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            link_json = json.load(f)
            return Link.from_json(link_json)
    return None
@enforce_types
def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
    """check for an existing link archive in the given directory,
    and load+merge it into the given link dict
    """
    link_dir = link_dir or link.link_dir

    existing_link = parse_json_link_index(link_dir)
    if existing_link:
        return merge_links(existing_link, link)

    return link
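
# Usage sketch ('archive/1554203511' is a hypothetical per-link folder; real
# folders are named by the link's timestamp, as in the favicon_url path above):
#
#   link = load_json_link_index(link)                 # merge in previously saved state
#   existing = parse_json_link_index('archive/1554203511')
#   if existing:
#       print(existing.url)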
@enforce_types
def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
    link_dir = link_dir or link.link_dir

    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(link_dir, 'index.html')

    html_index = Template(link_html).substitute({
        **derived_link_info(link),
        'title': (
            link.title
            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
        ),
        'archive_url': urlencode(
            wget_output_path(link)
            or (link.domain if link.is_archived else 'about:blank')
        ),
        'extension': link.extension or 'html',
        'tags': link.tags or 'untagged',
        'status': 'archived' if link.is_archived else 'not yet archived',
        'status_color': 'success' if link.is_archived else 'danger',
    })
    atomic_write(html_index, path)
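
# Typical per-link flow tying these together (a sketch; the archive methods that
# populate link.history live outside this module):
#
#   link = load_json_link_index(link)   # merge any previously saved state
#   ...                                 # run archive methods, updating link.history
#   write_link_index(link)              # persist index.json + index.html for the link
#   patch_links_index(link)             # refresh this link's row in the main index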