__package__ = 'archivebox.legacy'

import os
import json

from typing import List, Tuple, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager

from .schema import Link, ArchiveResult
from .config import (
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    OUTPUT_DIR,
    TIMEOUT,
    URL_BLACKLIST_PTN,
    ANSI,
    stderr,
)
from .storage.html import write_html_main_index, write_html_link_details
from .storage.json import (
    parse_json_main_index,
    write_json_main_index,
    parse_json_link_details,
    write_json_link_details,
)
from .storage.sql import (
    write_sql_main_index,
    parse_sql_main_index,
)
from .util import (
    scheme,
    enforce_types,
    TimedProgress,
    atomic_write,
    ExtendedEncoder,
)
from .parse import parse_links
from .logs import (
    log_indexing_process_started,
    log_indexing_process_finished,
    log_indexing_started,
    log_indexing_finished,
    log_parsing_started,
    log_parsing_finished,
)


### Link filtering and checking

@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    # longest url wins (because a fuzzy url will always be shorter)
    url = a.url if len(a.url) > len(b.url) else b.url

    # best title based on length and quality
    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = None
    if len(possible_titles) == 2:
        title = max(possible_titles, key=lambda t: len(t))
    elif len(possible_titles) == 1:
        title = possible_titles[0]

    # earliest valid timestamp
    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    # all unique, truthy tags
    tags_set = (
        set(tag.strip() for tag in (a.tags or '').split(','))
        | set(tag.strip() for tag in (b.tags or '').split(','))
    )
    tags = ','.join(tags_set) or None

    # all unique source entries
    sources = list(set(a.sources + b.sources))

    # all unique history entries for the combined archive methods
    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }
    for method in all_methods:
        deduped_jsons = {
            json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
            for result in history[method]
        }
        history[method] = list(reversed(sorted(
            (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
            key=lambda result: result.start_ts,
        )))

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )
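
# Illustrative sketch of merge_links behaviour (added for clarity; the Link field
# values below are hypothetical, not taken from a real archive):
#
#   >>> a = Link(url='https://example.com/page', timestamp='1556233273',
#   ...          title=None, tags='news', sources=['feed.rss'], history={})
#   >>> b = Link(url='https://example.com/page', timestamp='1556240000',
#   ...          title='Example Page', tags='tech', sources=['bookmarks.html'], history={})
#   >>> merged = merge_links(a, b)
#   >>> merged.timestamp                  # earliest valid timestamp wins
#   '1556233273'
#   >>> merged.title                      # only non-empty, non-URL titles are considered
#   'Example Page'
#   >>> sorted(merged.tags.split(','))    # tags from both links are combined
#   ['news', 'tech']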


@enforce_types
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls

    if not links:
        stderr('{red}[X] No links found in index.{reset}'.format(**ANSI))
        stderr('    To add a link to your archive, run:')
        stderr("        archivebox add 'https://example.com'")
        stderr()
        stderr('    For more usage and examples, run:')
        stderr('        archivebox help')
        raise SystemExit(1)

    return links


@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """remove chrome://, about://, or other schemed links that can't be archived"""
    for link in links:
        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
        if scheme_is_valid and not_blacklisted:
            yield link
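
# Note (added for clarity): with the filters above, links like 'chrome://settings' or
# 'mailto:user@example.com' are dropped here, while 'https://example.com' passes through,
# unless it matches the optional URL_BLACKLIST_PTN regex from the config.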


@enforce_types
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
    """
    ensures that all non-duplicate links have monotonically increasing timestamps
    """
    unique_urls: OrderedDict[str, Link] = OrderedDict()

    for link in sorted_links:
        if link.base_url in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[link.base_url], link)
        unique_urls[link.base_url] = link

    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
    for link in unique_urls.values():
        new_link = link.overwrite(
            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
        )
        unique_timestamps[new_link.timestamp] = new_link

    return unique_timestamps.values()


@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
    return sorted(links, key=sort_func, reverse=True)


@enforce_types
def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
    if not resume:
        yield from links
        return

    for link in links:
        try:
            if float(link.timestamp) <= resume:
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')


@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal: 1234, 1234, 1234 -> 1234, 1234.0, 1234.1"""

    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp
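
# Illustrative sequence (follows directly from the logic above; the timestamp value
# is hypothetical):
#
#   >>> used: OrderedDict = OrderedDict()
#   >>> for _ in range(3):
#   ...     ts = lowest_uniq_timestamp(used, '1556233273')
#   ...     used[ts] = None
#   ...     print(ts)
#   1556233273
#   1556233273.0
#   1556233273.1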


### Main Links Index

@contextmanager
@enforce_types
def timed_index_update(out_path: str):
    log_indexing_started(out_path)
    timer = TimedProgress(TIMEOUT * 2, prefix='      ')
    try:
        yield
    finally:
        timer.end()

    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
    log_indexing_finished(out_path)


@enforce_types
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """write the SQL, JSON, and HTML main index files for a given list of links"""

    log_indexing_process_started(len(links))

    with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
        write_sql_main_index(links, out_dir=out_dir)

    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
        write_json_main_index(links, out_dir=out_dir)

    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
        write_html_main_index(links, out_dir=out_dir, finished=finished)

    log_indexing_process_finished()


@enforce_types
def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
    """parse and load the existing JSON and SQL main indexes from out_dir, warning if they disagree"""

    all_links: List[Link] = []
    all_links = list(parse_json_main_index(out_dir))
    links_from_sql = list(parse_sql_main_index(out_dir))

    if warn and set(l.url for l in all_links) != set(l.url for l in links_from_sql):
        stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
        stderr('    To repair the index and re-import any orphaned links run:')
        stderr('        archivebox init')

    return all_links


@enforce_types
def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            meta_dict = json.load(f)
            meta_dict.pop('links')
            return meta_dict

    return None


@enforce_types
def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
    new_links: List[Link] = []

    # parse and validate the import file
    log_parsing_started(import_path)
    raw_links, parser_name = parse_links(import_path)
    new_links = list(validate_links(raw_links))

    # merge existing links in out_dir and new links
    all_links = list(validate_links(existing_links + new_links))

    if parser_name:
        num_parsed = len(raw_links)
        num_new_links = len(all_links) - len(existing_links)
        log_parsing_finished(num_parsed, num_new_links, parser_name)

    return all_links, new_links
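
# Typical import flow using the helpers in this module (illustrative sketch;
# the import file path is hypothetical):
#
#   existing_links = load_main_index(out_dir=OUTPUT_DIR)
#   all_links, new_links = import_new_links(existing_links, '/path/to/bookmarks.html')
#   write_main_index(all_links, out_dir=OUTPUT_DIR)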


@enforce_types
def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
    """hack to in-place update one row's info in the generated index files"""

    # TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous

    title = link.title or link.latest_outputs(status='succeeded')['title']
    successful = link.num_outputs

    # Patch JSON main index
    json_file_links = parse_json_main_index(out_dir)
    patched_links = []
    for saved_link in json_file_links:
        if saved_link.url == link.url:
            patched_links.append(saved_link.overwrite(
                title=title,
                history=link.history,
                updated=link.updated,
            ))
        else:
            patched_links.append(saved_link)

    write_json_main_index(patched_links, out_dir=out_dir)

    # Patch HTML main index
    html_path = os.path.join(out_dir, 'index.html')
    with open(html_path, 'r') as f:
        html = f.read().split('\n')

    for idx, line in enumerate(html):
        if title and ('<span data-title-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(title)
        elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(successful)
            break

    atomic_write('\n'.join(html), html_path)


### Link Details Index

@enforce_types
def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    out_dir = out_dir or link.link_dir

    write_json_link_details(link, out_dir=out_dir)
    write_html_link_details(link, out_dir=out_dir)


@enforce_types
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
    """check for an existing link archive in the given directory,
    and load+merge it into the given link dict
    """
    out_dir = out_dir or link.link_dir

    existing_link = parse_json_link_details(out_dir)
    if existing_link:
        return merge_links(existing_link, link)

    return link