  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. """
  6. __package__ = 'archivebox.parsers'
from io import StringIO
from typing import IO, Tuple, List, Optional
from datetime import datetime, timezone
from pathlib import Path

from ..system import atomic_write
from ..config import (
    ANSI,
    OUTPUT_DIR,
    SOURCES_DIR_NAME,
    TIMEOUT,
    stderr,
    hint,
)
from ..util import (
    basename,
    htmldecode,
    download_url,
    enforce_types,
)
from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved

from . import pocket_api
from . import readwise_reader_api
from . import wallabag_atom
from . import pocket_html
from . import pinboard_rss
from . import shaarli_rss
from . import medium_rss
from . import netscape_html
from . import generic_rss
from . import generic_json
from . import generic_jsonl
from . import generic_html
from . import generic_txt
from . import url_list

PARSERS = {
    # Specialized parsers
    pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
    readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
    wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
    pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
    pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
    shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
    medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),

    # General parsers
    netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
    generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
    generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
    generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
    generic_html.KEY: (generic_html.NAME, generic_html.PARSER),

    # Catchall fallback parser
    generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),

    # Explicitly specified parsers
    url_list.KEY: (url_list.NAME, url_list.PARSER),
}
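
# Illustrative sketch of the interface each parser module above is assumed to
# expose, inferred from how PARSERS is built and called in run_parser_functions():
#
#     KEY    = 'example'                # short id, usable as parse_links(..., parser=KEY)
#     NAME   = 'Example Export'         # human-readable name shown in logs
#     PARSER = parse_example_export     # callable(to_parse: IO[str], root_url=None) -> iterable of Link
#
# The sample values here are hypothetical; only the attribute names and call
# signature are taken from this file.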

@enforce_types
def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
    """
    parse a list of URLs without touching the filesystem
    """
    timer = TimedProgress(TIMEOUT * 4)

    #urls = list(map(lambda x: x + "\n", urls))
    file = StringIO()
    file.writelines(urls)
    file.name = "io_string"

    links, parser = run_parser_functions(file, timer, root_url=root_url)
    timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
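
# Illustrative usage (hypothetical input):
#
#     links, parser_name = parse_links_memory(['https://example.com/page\n'])
#     # `links` is a list of Link objects; `parser_name` is the NAME of the
#     # parser that produced the most links, or 'Failed to parse' if none matched.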

@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """
    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        links, parser = run_parser_functions(file, timer, root_url=root_url, parser=parser)
        timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
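
# Illustrative usage (hypothetical file path): skip auto-detection and force a
# specific parser by passing its KEY, e.g. the plain URL-list parser:
#
#     links, parser_name = parse_links('output/sources/1700000000-stdin.txt', parser=url_list.KEY)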

def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]:
    most_links: List[Link] = []
    best_parser_name = None

    if parser != "auto":
        parser_name, parser_func = PARSERS[parser]
        parsed_links = list(parser_func(to_parse, root_url=root_url))
        if not parsed_links:
            stderr()
            stderr(f'[X] No links found using {parser_name} parser', color='red')
            hint('Try a different parser or double check the input?')
            stderr()

        timer.end()
        return parsed_links, parser_name

    for parser_id in PARSERS:
        parser_name, parser_func = PARSERS[parser_id]
        try:
            parsed_links = list(parser_func(to_parse, root_url=root_url))
            if not parsed_links:
                raise Exception(f'No links found using {parser_name} parser')

            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
            if len(parsed_links) > len(most_links):
                most_links = parsed_links
                best_parser_name = parser_name

        except Exception as err:   # noqa
            # Parsers are tried one by one down the list, and the one that finds
            # the most links is used. To debug why a certain parser was not used
            # due to a python error or format incompatibility, uncomment this line:
            # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
            # raise
            pass
    timer.end()

    return most_links, best_parser_name

@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))

    referenced_texts = ''
    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except Exception as err:
            print(err)

    atomic_write(source_path, raw_text + '\n' + referenced_texts)
    log_source_saved(source_file=source_path)
    return source_path

@enforce_types
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str:
    """download the given path or URL's content into output/sources/<timestamp>-<basename>.txt"""
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # Source is a URL that needs to be downloaded
        print(f' > Downloading {path} contents')
        timer = TimedProgress(timeout, prefix=' ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            raw_source_text = htmldecode(raw_source_text)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print(' ', e)
            raise e
    else:
        # Source is a path to a local file on the filesystem
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(source_path, raw_source_text)
    log_source_saved(source_file=source_path)
    return source_path
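
# Illustrative usage (hypothetical URL): save a remote bookmarks export into
# output/sources/, then parse the saved copy:
#
#     source_path = save_file_as_source('https://example.com/bookmarks.html')
#     links, parser_name = parse_links(source_path)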