archivebox/parsers/__init__.py

  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. """
  6. __package__ = 'archivebox.parsers'
  7. import re
  8. from io import StringIO
  9. from typing import IO, Tuple, List, Optional
  10. from datetime import datetime, timezone
  11. from pathlib import Path
  12. from ..system import atomic_write
  13. from ..config import (
  14. ANSI,
  15. OUTPUT_DIR,
  16. SOURCES_DIR_NAME,
  17. TIMEOUT,
  18. stderr,
  19. hint,
  20. )
  21. from ..util import (
  22. basename,
  23. htmldecode,
  24. download_url,
  25. enforce_types,
  26. URL_REGEX,
  27. )
  28. from ..index.schema import Link
  29. from ..logging_util import TimedProgress, log_source_saved
  30. from . import pocket_api
  31. from . import readwise_reader_api
  32. from . import wallabag_atom
  33. from . import pocket_html
  34. from . import pinboard_rss
  35. from . import shaarli_rss
  36. from . import medium_rss
  37. from . import netscape_html
  38. from . import generic_rss
  39. from . import generic_json
  40. from . import generic_jsonl
  41. from . import generic_html
  42. from . import generic_txt
  43. from . import url_list

PARSERS = {
    # Specialized parsers
    pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
    readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
    wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
    pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
    pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
    shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER),
    medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER),

    # General parsers
    netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER),
    generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER),
    generic_json.KEY: (generic_json.NAME, generic_json.PARSER),
    generic_jsonl.KEY: (generic_jsonl.NAME, generic_jsonl.PARSER),
    generic_html.KEY: (generic_html.NAME, generic_html.PARSER),

    # Catchall fallback parser
    generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER),

    # Explicitly specified parsers
    url_list.KEY: (url_list.NAME, url_list.PARSER),
}
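
# Each parser module registered above follows the same convention: it exposes
# KEY (the short identifier accepted as the `parser` argument below), NAME (a
# human-readable label), and PARSER (a function that takes the open source file
# plus a root_url keyword argument and yields Link objects). A minimal sketch of
# a hypothetical custom parser module following that contract (the names below
# are illustrative only, and the Link fields are assumed from ..index.schema):
#
#   KEY = 'my_line_format'
#   NAME = 'My Line Format'
#
#   def parse_my_line_format_export(text_file: IO[str], **_kwargs):
#       text_file.seek(0)
#       for line in text_file.readlines():
#           url = line.strip()
#           if url:
#               yield Link(
#                   url=htmldecode(url),
#                   timestamp=str(datetime.now(timezone.utc).timestamp()),
#                   title=None,
#                   tags=None,
#                   sources=[text_file.name],
#               )
#
#   PARSER = parse_my_line_format_export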


@enforce_types
def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
    """
    parse a list of URLs without touching the filesystem
    """
    timer = TimedProgress(TIMEOUT * 4)

    #urls = list(map(lambda x: x + "\n", urls))
    file = StringIO()
    file.writelines(urls)
    file.name = "io_string"

    links, parser = run_parser_functions(file, timer, root_url=root_url)
    timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
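
# Illustrative usage (not executed here): parse URLs already held in memory,
# e.g. URLs passed on the command line, without writing a sources file first.
# Newlines are included in each entry since writelines() does not add them:
#
#   links, parser_used = parse_links_memory(['https://example.com\n', 'https://example.org\n'])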


@enforce_types
def parse_links(source_file: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        links, parser = run_parser_functions(file, timer, root_url=root_url, parser=parser)
        timer.end()

    if parser is None:
        return [], 'Failed to parse'
    return links, parser
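
# Illustrative usage (not executed here): parse a saved source file, either
# auto-detecting the format or forcing a specific parser by its KEY:
#
#   links, parser_used = parse_links('output/sources/1712345678-stdin.txt')
#   links, parser_used = parse_links('bookmarks_export.html', parser=netscape_html.KEY)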


def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]:
    most_links: List[Link] = []
    best_parser_name = None

    if parser != "auto":
        parser_name, parser_func = PARSERS[parser]
        parsed_links = list(parser_func(to_parse, root_url=root_url))
        if not parsed_links:
            stderr()
            stderr(f'[X] No links found using {parser_name} parser', color='red')
            hint('Try a different parser or double check the input?')
            stderr()
        timer.end()
        return parsed_links, parser_name

    for parser_id in PARSERS:
        parser_name, parser_func = PARSERS[parser_id]
        try:
            parsed_links = list(parser_func(to_parse, root_url=root_url))
            if not parsed_links:
                raise Exception(f'No links found using {parser_name} parser')

            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')

            if len(parsed_links) > len(most_links):
                most_links = parsed_links
                best_parser_name = parser_name

        except Exception as err:   # noqa
            # Parsers are tried one by one down the list, and the one that finds
            # the most links wins. To debug why a certain parser was not used
            # due to a python error or format incompatibility, uncomment this line:
            # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
            # raise
            pass

    timer.end()
    return most_links, best_parser_name


@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))

    referenced_texts = ''
    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except Exception as err:
            print(err)

    atomic_write(source_path, raw_text + '\n' + referenced_texts)
    log_source_saved(source_file=source_path)
    return source_path
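
# Illustrative usage (not executed here): persist raw text piped in on stdin to
# the sources dir, then feed the resulting file back through parse_links:
#
#   source_path = save_text_as_source('https://example.com\nhttps://example.org')
#   links, parser_used = parse_links(source_path)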


@enforce_types
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str:
    """download a given url's content into output/sources/domain-<timestamp>.txt"""
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(OUTPUT_DIR / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # Source is a URL that needs to be downloaded
        print(f'    > Downloading {path} contents')
        timer = TimedProgress(timeout, prefix='      ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            raw_source_text = htmldecode(raw_source_text)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print('    ', e)
            raise e
    else:
        # Source is a path to a local file on the filesystem
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(source_path, raw_source_text)
    log_source_saved(source_file=source_path)
    return source_path
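
# Illustrative usage (not executed here): remote URLs get downloaded and
# HTML-decoded, local paths are read from disk as-is; either way the contents
# end up as a timestamped file under the sources dir:
#
#   source_path = save_file_as_source('https://example.com/bookmarks_export.html')
#   source_path = save_file_as_source('./ris_or_txt_export.txt')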


# Check that plain text regex URL parsing works as expected
#   this is last-line-of-defense to make sure the URL_REGEX isn't
#   misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
#   the consequences of bad URL parsing could be disastrous and lead to many
#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
_test_url_strs = {
    'example.com': 0,
    '/example.com': 0,
    '//example.com': 0,
    ':/example.com': 0,
    '://example.com': 0,
    'htt://example8.com': 0,
    '/htt://example.com': 0,
    'https://example': 1,
    'https://localhost/2345': 1,
    'https://localhost:1234/123': 1,
    '://': 0,
    'https://': 0,
    'http://': 0,
    'ftp://': 0,
    'ftp://example.com': 0,
    'https://example.com': 1,
    'https://example.com/': 1,
    'https://a.example.com': 1,
    'https://a.example.com/': 1,
    'https://a.example.com/what/is/happening.html': 1,
    'https://a.example.com/what/ís/happening.html': 1,
    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
    '<test>http://example7.com</test>': 1,
    'https://<test>': 0,
    'https://[test]': 0,
    'http://"test"': 0,
    'http://\'test\'': 0,
    '[https://example8.com/what/is/this.php?what=1]': 1,
    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
    '<what>https://example10.com#and-thing=2 "</about>': 1,
    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
    '<or>http://examplehttp://15.badc</that>': 2,
    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}
for url_str, num_urls in _test_url_strs.items():
    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
        f'{url_str} does not contain {num_urls} urls')