# index.py

import os
import json

from datetime import datetime
from string import Template
from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
from collections import OrderedDict

from .schema import Link, ArchiveResult
from .config import (
    OUTPUT_DIR,
    TEMPLATES_DIR,
    VERSION,
    GIT_SHA,
    FOOTER_INFO,
    TIMEOUT,
    URL_BLACKLIST_PTN,
    ANSI,
    stderr,
)
from .util import (
    scheme,
    fuzzy_url,
    ts_to_date,
    urlencode,
    htmlencode,
    urldecode,
    wget_output_path,
    enforce_types,
    TimedProgress,
    copy_and_overwrite,
    atomic_write,
    ExtendedEncoder,
)
from .parse import parse_links
from .logs import (
    log_indexing_process_started,
    log_indexing_started,
    log_indexing_finished,
    log_parsing_started,
    log_parsing_finished,
)

TITLE_LOADING_MSG = 'Not yet archived...'


### Link filtering and checking

@enforce_types
def derived_link_info(link: Link) -> dict:
    """extend link info with the archive urls and other derived data"""

    info = link._asdict(extended=True)
    info.update(link.canonical_outputs())

    return info

@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    url = a.url if len(a.url) > len(b.url) else b.url

    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = None
    if len(possible_titles) == 2:
        title = max(possible_titles, key=lambda t: len(t))
    elif len(possible_titles) == 1:
        title = possible_titles[0]

    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    tags_set = (
        set(tag.strip() for tag in (a.tags or '').split(','))
        | set(tag.strip() for tag in (b.tags or '').split(','))
    )
    tags = ','.join(tags_set) or None

    sources = list(set(a.sources + b.sources))

    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }
    for method in all_methods:
        deduped_jsons = {
            json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
            for result in history[method]
        }
        history[method] = list(reversed(sorted(
            (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
            key=lambda result: result.start_ts,
        )))

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )
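
# Rough usage sketch for merge_links (comment only, never executed). It assumes
# Link.base_url compares URLs without their scheme, so the http:// and https://
# variants below count as the same page; all field values are made up.
#
#   a = Link(url='http://example.com/page', timestamp='1544372011.0',
#            title='Example', tags='news', sources=['feed.rss'], history={})
#   b = Link(url='https://example.com/page', timestamp='1544372099.0',
#            title='Example Page', tags='tech', sources=['bookmarks.html'], history={})
#   merged = merge_links(a, b)
#   # merged.url   == 'https://example.com/page'   (longer url wins)
#   # merged.title == 'Example Page'               (longer title wins)
#   # merged.timestamp == '1544372011.0'           (earlier timestamp wins)
#   # merged.tags contains both 'news' and 'tech'  (set union, order not guaranteed)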


def validate_links(links: Iterable[Link]) -> Iterable[Link]:
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls

    if not links:
        stderr('{red}[X] No links found in index.json{reset}'.format(**ANSI))
        stderr('    To add a link to your archive, run:')
        stderr("        archivebox add 'https://example.com'")
        stderr()
        stderr('    For more usage and examples, run:')
        stderr('        archivebox help')
        raise SystemExit(1)

    return links
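
# Sketch of how the pipeline composes (hypothetical input): unsupported schemes
# are dropped by archivable_links, the rest are sorted newest-first, and links
# sharing a base_url are merged into a single entry by uniquefied_links.
#
#   raw = [
#       Link(url='https://example.com', timestamp='1544372011', ...),
#       Link(url='chrome://settings', timestamp='1544372012', ...),    # dropped
#       Link(url='https://example.com', timestamp='1544372013', ...),  # merged with the first
#   ]
#   links = list(validate_links(raw))   # -> one merged Link for example.com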


def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """remove chrome://, about:// or other schemed links that can't be archived"""
    for link in links:
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
        if scheme_is_valid and not_blacklisted:
            yield link


def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
    """
    ensures that all non-duplicate links have monotonically increasing timestamps
    """
    unique_urls: OrderedDict[str, Link] = OrderedDict()

    for link in sorted_links:
        if link.base_url in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[link.base_url], link)
        unique_urls[link.base_url] = link

    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
    for link in unique_urls.values():
        new_link = link.overwrite(
            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
        )
        unique_timestamps[new_link.timestamp] = new_link

    return unique_timestamps.values()
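
# Example of the two dedupe passes (hypothetical timestamps): links with the same
# base_url are merged first, then any remaining timestamp collisions between
# different URLs are bumped to the next free '<ts>.<nonce>' slot:
#
#   input timestamps:  '1544372011', '1544372011', '1544372012'   (three distinct URLs)
#   output timestamps: '1544372011', '1544372011.0', '1544372012'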


def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
    return sorted(links, key=sort_func, reverse=True)


def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
    if not resume:
        yield from links
        return

    for link in links:
        try:
            if float(link.timestamp) <= resume:
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')
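
# Resume example (hypothetical timestamps): per the check above, only links whose
# timestamp is <= the resume value are yielded; everything newer is skipped.
#
#   timestamps: '1544372011', '1544372012', '1544372013'
#   list(links_after_timestamp(links, resume=1544372012.0))
#   # -> the links stamped '1544372011' and '1544372012'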


def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp
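
# Collision example (runnable in a REPL, values are arbitrary): the bare timestamp
# is preferred, then '.0', '.1', ... are tried in order.
#
#   used = OrderedDict.fromkeys(['1544372011', '1544372011.0'])
#   lowest_uniq_timestamp(used, '1544372011')   # -> '1544372011.1'
#   lowest_uniq_timestamp(used, '1544372012')   # -> '1544372012'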


### Homepage index for all the links

@enforce_types
def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """create index.html file for a given list of links"""

    log_indexing_process_started()

    log_indexing_started(out_dir, 'index.json')
    timer = TimedProgress(TIMEOUT * 2, prefix='    ')
    try:
        write_json_links_index(links, out_dir=out_dir)
    finally:
        timer.end()
    log_indexing_finished(out_dir, 'index.json')

    log_indexing_started(out_dir, 'index.html')
    timer = TimedProgress(TIMEOUT * 2, prefix='    ')
    try:
        write_html_links_index(links, out_dir=out_dir, finished=finished)
    finally:
        timer.end()
    log_indexing_finished(out_dir, 'index.html')


@enforce_types
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
    """parse and load existing index with any new links from import_path merged in"""

    existing_links: List[Link] = []
    if out_dir:
        existing_links = list(parse_json_links_index(out_dir))

    new_links: List[Link] = []
    if import_path:
        # parse and validate the import file
        log_parsing_started(import_path)
        raw_links, parser_name = parse_links(import_path)
        new_links = list(validate_links(raw_links))

    # merge existing links in out_dir and new links
    all_links = list(validate_links(existing_links + new_links))

    if import_path and parser_name:
        num_parsed = len(raw_links)
        num_new_links = len(all_links) - len(existing_links)
        log_parsing_finished(num_parsed, num_new_links, parser_name)

    return all_links, new_links
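
# Typical call patterns (sketch only; the real call sites live in the CLI layer,
# and the import file path below is made up):
#
#   all_links, new_links = load_links_index(out_dir=OUTPUT_DIR)              # reload existing index
#   all_links, new_links = load_links_index(OUTPUT_DIR, '~/bookmarks.html')  # merge in an import
#
# all_links is the full validated/deduped index, new_links only the entries that
# were parsed out of import_path.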


@enforce_types
def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """write the json link index to a given path"""

    assert isinstance(links, list), 'Links must be a list, not a generator.'
    assert not links or isinstance(links[0].history, dict)
    assert not links or isinstance(links[0].sources, list)

    if links and links[0].history.get('title'):
        assert isinstance(links[0].history['title'][0], ArchiveResult)

    if links and links[0].sources:
        assert isinstance(links[0].sources[0], str)

    path = os.path.join(out_dir, 'index.json')

    index_json = {
        'info': 'ArchiveBox Index',
        'source': 'https://github.com/pirate/ArchiveBox',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'version': VERSION,
        'num_links': len(links),
        'updated': datetime.now(),
        'links': links,
    }
    atomic_write(index_json, path)
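
# Shape of the resulting index.json (abridged sketch; the Link and datetime values
# are serialized by atomic_write, presumably via ExtendedEncoder):
#
#   {
#     "info": "ArchiveBox Index",
#     "version": "...",
#     "num_links": 1,
#     "updated": "2019-...",
#     "links": [{"url": "https://example.com", "timestamp": "1544372011", ...}]
#   }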


@enforce_types
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and return the list of links"""

    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = json.load(f)['links']
            for link_json in links:
                yield Link.from_json(link_json)

    return ()
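
# Lazy counterpart of write_json_links_index above: it yields Link objects one at
# a time, so materialize it with list() before rewriting the same index file.
#
#   links = list(parse_json_links_index(out_dir=OUTPUT_DIR))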


@enforce_types
def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """write the html link index to a given path"""

    copy_and_overwrite(
        os.path.join(TEMPLATES_DIR, 'static'),
        os.path.join(out_dir, 'static'),
    )

    atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
        index_html = f.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
        link_row_html = f.read()

    link_rows = []
    for link in links:
        template_row_vars: Mapping[str, str] = {
            **derived_link_info(link),
            'title': (
                link.title
                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
            ),
            'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
            'favicon_url': (
                os.path.join('archive', link.timestamp, 'favicon.ico')
                # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
            ),
            'archive_url': urlencode(
                wget_output_path(link) or 'index.html'
            ),
        }
        link_rows.append(Template(link_row_html).substitute(**template_row_vars))

    template_vars: Mapping[str, str] = {
        'num_links': str(len(links)),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'version': VERSION,
        'git_sha': GIT_SHA,
        'rows': '\n'.join(link_rows),
        'status': 'finished' if finished else 'running',
    }
    template_html = Template(index_html).substitute(**template_vars)

    atomic_write(template_html, os.path.join(out_dir, 'index.html'))


@enforce_types
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
    """hack to in-place update one row's info in the generated index html"""

    title = link.title or link.latest_outputs()['title']
    successful = link.num_outputs

    # Patch JSON index
    json_file_links = parse_json_links_index(out_dir)
    patched_links = []
    for saved_link in json_file_links:
        if saved_link.url == link.url:
            patched_links.append(saved_link.overwrite(
                title=title,
                history=link.history,
                updated=link.updated,
            ))
        else:
            patched_links.append(saved_link)

    write_json_links_index(patched_links, out_dir=out_dir)

    # Patch HTML index
    html_path = os.path.join(out_dir, 'index.html')
    html = open(html_path, 'r').read().split('\n')
    for idx, line in enumerate(html):
        if title and ('<span data-title-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(title)
        elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(successful)
            break

    atomic_write('\n'.join(html), html_path)


### Individual link index

@enforce_types
def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
    link_dir = link_dir or link.link_dir

    write_json_link_index(link, link_dir)
    write_html_link_index(link, link_dir)


@enforce_types
def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
    """write a json file with some info about the link"""

    link_dir = link_dir or link.link_dir
    path = os.path.join(link_dir, 'index.json')

    atomic_write(link._asdict(), path)


@enforce_types
def parse_json_link_index(link_dir: str) -> Optional[Link]:
    """load the json link index from a given directory"""
    existing_index = os.path.join(link_dir, 'index.json')
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            link_json = json.load(f)
            return Link.from_json(link_json)
    return None


@enforce_types
def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
    """check for an existing link archive in the given directory,
    and load+merge it into the given link dict
    """
    link_dir = link_dir or link.link_dir
    existing_link = parse_json_link_index(link_dir)
    if existing_link:
        return merge_links(existing_link, link)
    return link


@enforce_types
def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
    link_dir = link_dir or link.link_dir

    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(link_dir, 'index.html')

    template_vars: Mapping[str, str] = {
        **derived_link_info(link),
        'title': (
            link.title
            or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
        ),
        'url_str': htmlencode(urldecode(link.base_url)),
        'archive_url': urlencode(
            wget_output_path(link)
            or (link.domain if link.is_archived else 'about:blank')
        ),
        'extension': link.extension or 'html',
        'tags': link.tags or 'untagged',
        'status': 'archived' if link.is_archived else 'not yet archived',
        'status_color': 'success' if link.is_archived else 'danger',
        'oldest_archive_date': ts_to_date(link.oldest_archive_date),
    }
    html_index = Template(link_html).substitute(**template_vars)

    atomic_write(html_index, path)
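

# Rough end-to-end sketch for a single link's folder (comment only; the actual
# call sites live elsewhere in the package):
#
#   link = load_json_link_index(link, link_dir=link.link_dir)  # merge any existing per-link index
#   # ... run the archive methods, appending ArchiveResults to link.history ...
#   write_link_index(link)      # regenerate <link_dir>/index.json and index.html
#   patch_links_index(link)     # update this link's row in the top-level indexes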