  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. """
  6. __package__ = 'archivebox.parsers'
  7. from io import StringIO
  8. from typing import IO, Tuple, List, Optional
  9. from datetime import datetime, timezone
  10. from pathlib import Path
  11. from archivebox.config import DATA_DIR, CONSTANTS
  12. from archivebox.config.common import SHELL_CONFIG, ARCHIVING_CONFIG
  13. from archivebox.misc.system import atomic_write
  14. from archivebox.misc.logging import stderr, hint
  15. from archivebox.misc.logging_util import TimedProgress, log_source_saved
  16. from archivebox.misc.util import (
  17. basename,
  18. htmldecode,
  19. download_url,
  20. enforce_types,
  21. )
  22. from ..index.schema import Link
  23. from . import pocket_api
  24. from . import readwise_reader_api
  25. from . import wallabag_atom
  26. from . import pocket_html
  27. from . import pinboard_rss
  28. from . import shaarli_rss
  29. from . import medium_rss
  30. from . import netscape_html
  31. from . import generic_rss
  32. from . import generic_json
  33. from . import generic_jsonl
  34. from . import generic_html
  35. from . import generic_txt
  36. from . import url_list

PARSERS = {
    # Specialized parsers
    pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
    readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
    wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
    pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
    pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
    shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
    medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),

    # General parsers
    netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
    generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
    generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
    generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
    generic_html.KEY: (generic_html.NAME, generic_html.PARSER),

    # Catchall fallback parser
    generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),

    # Explicitly specified parsers
    url_list.KEY: (url_list.NAME, url_list.PARSER),
}
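
# Usage sketch (illustrative, not part of the original module): each PARSERS entry
# maps a parser key to a (human-readable name, parser function) tuple, so callers
# can pick a parser explicitly instead of relying on auto-detection. The exact key
# string depends on each module's KEY constant:
#
#   name, parser_func = PARSERS[generic_rss.KEY]
#   with open('feed.xml') as f:       # 'feed.xml' is a hypothetical input file
#       links = list(parser_func(f, root_url=None))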

@enforce_types
def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
    """
    parse a list of URLs without touching the filesystem
    """
    timer = TimedProgress(ARCHIVING_CONFIG.TIMEOUT * 4)

    # urls = list(map(lambda x: x + "\n", urls))

    file = StringIO()
    file.writelines(urls)
    file.name = "io_string"

    links, parser = run_parser_functions(file, timer, root_url=root_url)
    timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
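
# Example call (an illustrative sketch, not part of the original module) — parse a
# list of URL strings entirely in memory; terminating each entry with '\n' is an
# assumption about how callers separate multiple entries, since writelines() does
# not add newlines itself:
#
#   links, parser_name = parse_links_memory(['https://example.com\n'])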

@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """
    timer = TimedProgress(ARCHIVING_CONFIG.TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        links, parser = run_parser_functions(file, timer, root_url=root_url, parser=parser)
        timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
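
# Example call (an illustrative sketch, not part of the original module) — parse a
# previously saved source file; parser= may be set to any key in PARSERS to skip
# auto-detection. The path below is a hypothetical placeholder:
#
#   links, used_parser = parse_links('sources/1234567890-stdin.txt')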

def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]:
    most_links: List[Link] = []
    best_parser_name = None

    if parser != "auto":
        parser_name, parser_func = PARSERS[parser]
        parsed_links = list(parser_func(to_parse, root_url=root_url))
        if not parsed_links:
            stderr()
            stderr(f'[X] No links found using {parser_name} parser', color='red')
            hint('Try a different parser or double check the input?')
            stderr()
        timer.end()
        return parsed_links, parser_name

    for parser_id in PARSERS:
        parser_name, parser_func = PARSERS[parser_id]
        try:
            parsed_links = list(parser_func(to_parse, root_url=root_url))
            if not parsed_links:
                raise Exception(f'No links found using {parser_name} parser')

            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')

            if len(parsed_links) > len(most_links):
                most_links = parsed_links
                best_parser_name = parser_name

        except Exception as err:   # noqa
            # Every parser is tried in order, and the one that finds the most links wins.
            # To debug why a certain parser was not used due to a python error or a
            # format incompatibility, uncomment this line:

            # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
            # raise
            pass

    timer.end()
    return most_links, best_parser_name
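
# Direct usage sketch (illustrative, not part of the original module) — run every
# registered parser over one text stream and keep the result with the most links.
# Reusing a single stream for all parsers assumes each parser function rewinds it
# before reading, which is how parse_links_memory above is able to share one StringIO:
#
#   timer = TimedProgress(ARCHIVING_CONFIG.TIMEOUT * 4)
#   stream = StringIO('https://example.com\n')
#   stream.name = 'io_string'
#   links, parser_name = run_parser_functions(stream, timer, root_url=None)
#   timer.end()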

@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=DATA_DIR) -> str:
    """write the given raw text to a new timestamped file under sources/ and return its path"""
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(CONSTANTS.SOURCES_DIR / filename.format(ts=ts))

    referenced_texts = ''
    # don't attempt to read local files from the text, security risk:
    # for entry in raw_text.split():
    #     try:
    #         if Path(entry).exists():
    #             referenced_texts += Path(entry).read_text()
    #     except Exception as err:
    #         print(err)

    atomic_write(source_path, raw_text + '\n' + referenced_texts)
    log_source_saved(source_file=source_path)
    return source_path
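
# Example call (an illustrative sketch, not part of the original module) — persist
# piped-in text under sources/ and then parse the saved copy; the URL is a
# hypothetical placeholder:
#
#   source_path = save_text_as_source('https://example.com/page\n')
#   links, parser_name = parse_links(source_path)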

@enforce_types
def save_file_as_source(path: str, timeout: int=ARCHIVING_CONFIG.TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=DATA_DIR) -> str:
    """download a given url's (or copy a given local file's) contents into sources/<timestamp>-<basename>.txt"""
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(CONSTANTS.SOURCES_DIR / filename.format(basename=basename(path), ts=ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # Source is a URL that needs to be downloaded
        print(f' > Downloading {path} contents')
        timer = TimedProgress(timeout, prefix=' ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            raw_source_text = htmldecode(raw_source_text)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                SHELL_CONFIG.ANSI['red'],
                path,
                SHELL_CONFIG.ANSI['reset'],
            ))
            print(' ', e)
            raise e
    else:
        # Source is a path to a local file on the filesystem
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(source_path, raw_source_text)
    log_source_saved(source_file=source_path)
    return source_path
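
# Example calls (an illustrative sketch, not part of the original module) — both
# branches write a timestamped copy under sources/; the URL and local path are
# hypothetical placeholders:
#
#   remote_copy = save_file_as_source('https://example.com/bookmarks.html')
#   local_copy = save_file_as_source('/home/user/bookmarks.html')
#   links, parser_name = parse_links(remote_copy)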