  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. """
  6. __package__ = 'archivebox.parsers'
  7. import re
  8. from io import StringIO
  9. from typing import IO, Tuple, List, Optional
  10. from datetime import datetime, timezone
  11. from pathlib import Path
  12. from ..system import atomic_write
  13. from ..config import (
  14. ANSI,
  15. OUTPUT_DIR,
  16. SOURCES_DIR_NAME,
  17. TIMEOUT,
  18. stderr,
  19. hint,
  20. )
  21. from ..util import (
  22. basename,
  23. htmldecode,
  24. download_url,
  25. enforce_types,
  26. URL_REGEX,
  27. )
  28. from ..index.schema import Link
  29. from ..logging_util import TimedProgress, log_source_saved
  30. from . import pocket_api
  31. from . import readwise_reader_api
  32. from . import wallabag_atom
  33. from . import pocket_html
  34. from . import pinboard_rss
  35. from . import shaarli_rss
  36. from . import medium_rss
  37. from . import netscape_html
  38. from . import generic_rss
  39. from . import generic_json
  40. from . import generic_html
  41. from . import generic_txt
  42. from . import url_list

PARSERS = {
    # Specialized parsers
    pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
    readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
    wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
    pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
    pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
    shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
    medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),

    # General parsers
    netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
    generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
    generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
    generic_html.KEY: (generic_html.NAME, generic_html.PARSER),

    # Catchall fallback parser
    generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),

    # Explicitly specified parsers
    url_list.KEY: (url_list.NAME, url_list.PARSER),
}
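
# Each module registered above is expected to expose a KEY (the short identifier
# accepted by parse_links(..., parser=...)), a human-readable NAME, and a PARSER
# callable taking (file, root_url=...). Illustrative lookup, kept as a comment so
# nothing runs at import time (the filename is hypothetical):
#
#   parser_name, parser_func = PARSERS[generic_txt.KEY]
#   links = list(parser_func(open('bookmarks.txt'), root_url=None))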


@enforce_types
def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
    """
    parse a list of URLs without touching the filesystem
    """
    timer = TimedProgress(TIMEOUT * 4)
    #urls = list(map(lambda x: x + "\n", urls))
    file = StringIO()
    file.writelines(urls)
    file.name = "io_string"

    links, parser = run_parser_functions(file, timer, root_url=root_url)
    timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
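
# Example (illustrative, not executed): parse URLs held in memory, assuming each
# entry is already newline-terminated so the text parsers can split the entries:
#
#   links, parser_name = parse_links_memory(['https://example.com\n', 'https://example.org\n'])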


@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """
    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        links, parser = run_parser_functions(file, timer, root_url=root_url, parser=parser)
        timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
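
# Example (illustrative): parse a previously saved source file, either auto-detecting
# the format or forcing a specific parser by its KEY (the path shown is hypothetical):
#
#   links, parser_name = parse_links('output/sources/1700000000-stdin.txt', parser='auto')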


def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]:
    """Run the requested parser, or try every parser in PARSERS when parser="auto",
    and return (parsed_links, parser_name). In auto mode the parser that finds the
    most links wins, and parser_name is None if none of them found any.
    """
    most_links: List[Link] = []
    best_parser_name = None

    if parser != "auto":
        parser_name, parser_func = PARSERS[parser]
        parsed_links = list(parser_func(to_parse, root_url=root_url))
        if not parsed_links:
            stderr()
            stderr(f'[X] No links found using {parser_name} parser', color='red')
            hint('Try a different parser or double check the input?')
            stderr()
        timer.end()
        return parsed_links, parser_name

    for parser_id in PARSERS:
        parser_name, parser_func = PARSERS[parser_id]
        try:
            parsed_links = list(parser_func(to_parse, root_url=root_url))
            if not parsed_links:
                raise Exception(f'No links found using {parser_name} parser')

            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
            if len(parsed_links) > len(most_links):
                most_links = parsed_links
                best_parser_name = parser_name

        except Exception as err:   # noqa
            # Every parser is attempted, and the one that finds the most links wins.
            # To debug why a certain parser was not used due to a python error or a
            # format incompatibility, uncomment these lines:
            # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
            # raise
            pass

    timer.end()
    return most_links, best_parser_name


@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
    """save raw text (e.g. piped in via stdin) to sources/, appending the contents of any local file paths it mentions"""
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))

    referenced_texts = ''
    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except Exception as err:
            print(err)

    atomic_write(source_path, raw_text + '\n' + referenced_texts)
    log_source_saved(source_file=source_path)
    return source_path
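
# Example (illustrative): persist text piped in on stdin before parsing it; any
# whitespace-separated token that is also an existing local file path will have
# its contents appended to the saved source:
#
#   source_path = save_text_as_source('https://example.com\nhttps://example.org')
#   links, parser_name = parse_links(source_path)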


@enforce_types
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str:
    """download a given url's content (or copy a local file) into output/sources/<timestamp>-<basename>.txt"""
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # Source is a URL that needs to be downloaded
        print(f'    > Downloading {path} contents')
        timer = TimedProgress(timeout, prefix='      ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            raw_source_text = htmldecode(raw_source_text)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print('    ', e)
            raise e
    else:
        # Source is a path to a local file on the filesystem
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(source_path, raw_source_text)
    log_source_saved(source_file=source_path)
    return source_path
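
# Example (illustrative): snapshot a remote feed (or copy a local export) into
# sources/ first, then parse the saved copy (the URL shown is hypothetical):
#
#   source_path = save_file_as_source('https://example.com/bookmarks.rss', timeout=60)
#   links, parser_name = parse_links(source_path)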


# Check that plain-text regex URL parsing works as expected.
# This is a last line of defense to make sure URL_REGEX isn't misbehaving
# due to some OS-level or environment-level quirk (e.g. a bad regex lib).
# The consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly-parsed links being added to the archive, so it's worth the cost of checking.
_test_url_strs = {
    'example.com': 0,
    '/example.com': 0,
    '//example.com': 0,
    ':/example.com': 0,
    '://example.com': 0,
    'htt://example8.com': 0,
    '/htt://example.com': 0,
    'https://example': 1,
    'https://localhost/2345': 1,
    'https://localhost:1234/123': 1,
    '://': 0,
    'https://': 0,
    'http://': 0,
    'ftp://': 0,
    'ftp://example.com': 0,
    'https://example.com': 1,
    'https://example.com/': 1,
    'https://a.example.com': 1,
    'https://a.example.com/': 1,
    'https://a.example.com/what/is/happening.html': 1,
    'https://a.example.com/what/ís/happening.html': 1,
    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
    '<test>http://example7.com</test>': 1,
    'https://<test>': 0,
    'https://[test]': 0,
    'http://"test"': 0,
    'http://\'test\'': 0,
    '[https://example8.com/what/is/this.php?what=1]': 1,
    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
    '<what>https://example10.com#and-thing=2 "</about>': 1,
    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
    '<or>http://examplehttp://15.badc</that>': 2,
    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}

for url_str, num_urls in _test_url_strs.items():
    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
        f'{url_str} does not contain {num_urls} urls')