  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. Link: {
  6. 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
  7. 'timestamp': '1544212312.4234',
  8. 'title': 'Example.com Page Title',
  9. 'tags': 'abc,def',
  10. 'sources': [
  11. 'output/sources/ril_export.html',
  12. 'output/sources/getpocket.com-1523422111.txt',
  13. 'output/sources/stdin-234234112312.txt'
  14. ]
  15. }
  16. """

import re
import json

from typing import Tuple, List, IO, Iterable
from datetime import datetime
import xml.etree.ElementTree as etree

from .config import TIMEOUT
from .util import (
    htmldecode,
    str_between,
    URL_REGEX,
    check_url_parsing_invariants,
    TimedProgress,
    Link,
    enforce_types,
)


@enforce_types
def parse_links(source_file: str) -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """

    check_url_parsing_invariants()

    PARSERS = (
        # Specialized parsers
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),

        # General parsers
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_rss_export),
        ('Generic JSON', parse_json_export),

        # Fallback parser
        ('Plain Text', parse_plain_text_export),
    )

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in PARSERS:
            try:
                links = list(parser_func(file))
                if links:
                    timer.end()
                    return links, parser_name
            except Exception as err:  # noqa
                # Parsers are tried one by one down the list, and the first one
                # that succeeds is used. To see why a certain parser was not used
                # due to error or format incompatibility, uncomment this line:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                pass

    timer.end()
    return [], 'Failed to parse'
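
# Example usage of parse_links (a minimal sketch, not part of this module;
# it assumes a bookmarks export has already been saved under output/sources/):
#
#   links, parser_name = parse_links('output/sources/ril_export.html')
#   print('Parsed {} links using the {} parser'.format(len(links), parser_name))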


### Import Parser Functions

@enforce_types
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
    pattern = re.compile(r'^\s*<li><a href="(.+)" time_added="(\d+)" tags="(.*)">(.+)</a></li>', re.UNICODE)
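    # regex groups: 1 = href URL, 2 = time_added unix timestamp, 3 = comma-separated tags, 4 = link title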
    for line in html_file:
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
            time = datetime.fromtimestamp(float(match.group(2)))
            tags = match.group(3)
            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')

            yield Link(
                url=htmldecode(url),
                timestamp=str(time.timestamp()),
                title=htmldecode(title) or None,
                tags=tags or '',
                sources=[html_file.name],
            )


@enforce_types
def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)
    links = json.load(json_file)
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    for link in links:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if link:
            # Parse URL
            url = link.get('href') or link.get('url') or link.get('URL')
            if not url:
                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

            # Parse the timestamp
            ts_str = str(datetime.now().timestamp())
            if link.get('timestamp'):
                # chrome/ff histories use a very precise timestamp
                ts_str = str(link['timestamp'] / 10000000)
            elif link.get('time'):
                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
            elif link.get('created_at'):
                ts_str = str(json_date(link['created_at']).timestamp())
            elif link.get('created'):
                ts_str = str(json_date(link['created']).timestamp())
            elif link.get('date'):
                ts_str = str(json_date(link['date']).timestamp())
            elif link.get('bookmarked'):
                ts_str = str(json_date(link['bookmarked']).timestamp())
            elif link.get('saved'):
                ts_str = str(json_date(link['saved']).timestamp())

            # Parse the title
            title = None
            if link.get('title'):
                title = link['title'].strip()
            elif link.get('description'):
                title = link['description'].replace(' — Readability', '').strip()
            elif link.get('name'):
                title = link['name'].strip()

            yield Link(
                url=htmldecode(url),
                timestamp=ts_str,
                title=htmldecode(title) or None,
                tags=htmldecode(link.get('tags')) or '',
                sources=[json_file.name],
            )
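
# Example usage of parse_json_export (a minimal, hypothetical sketch; it feeds one
# in-memory Pinboard-style entry through the parser. Setting .name by hand works
# because io streams allow attribute assignment, and the parser records the name
# in Link.sources. Parsing the trailing 'Z' offset with %z assumes Python 3.7+.):
#
#   import io
#   f = io.StringIO('[{"href": "http://example.com", "description": "title here", "time": "2014-06-14T15:51:42Z"}]')
#   f.name = 'stdin-234234112312.txt'
#   print(list(parse_json_export(f)))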


@enforce_types
def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    items = rss_file.read().split('<item>')
    items = items[1:] if items else []
    for item in items:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )


@enforce_types
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Shaarli-specific RSS XML-format files into links"""

    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        # <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        # <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        # <id>https://demo.shaarli.org/?cEV4vw</id>
        # <published>2019-01-30T06:06:01+00:00</published>
        # <updated>2019-01-30T06:06:01+00:00</updated>
        # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>

        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )


@enforce_types
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

    html_file.seek(0)
    pattern = re.compile(r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>', re.UNICODE | re.IGNORECASE)
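    # regex groups: 1 = href URL, 2 = add_date unix timestamp, 3 = link title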
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))
            title = match.group(3).strip()

            yield Link(
                url=htmldecode(url),
                timestamp=str(time.timestamp()),
                title=htmldecode(title) or None,
                tags=None,
                sources=[html_file.name],
            )


@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        def find(tag):
            # compare against None explicitly: an Element with no children is falsy,
            # so `if item.find(tag)` would wrongly discard matched text-only elements
            node = item.find(tag)
            return node.text.strip() if node is not None and node.text else None

        url = find("{http://purl.org/rss/1.0/}link")
        tags = find("{http://purl.org/dc/elements/1.1/}subject")
        title = find("{http://purl.org/rss/1.0/}title")
        ts_str = find("{http://purl.org/dc/elements/1.1/}date")

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python versions before 3.7 can't parse with %z. Remove it:
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3] + ts_str[-2:]
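            # e.g. '2019-01-30T06:06:01+00:00' becomes '2019-01-30T06:06:01+0000'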

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )


@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Medium RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")  # type: ignore
    for item in items:
        url = item.find("link").text  # type: ignore
        title = item.find("title").text.strip()  # type: ignore
        ts_str = item.find("pubDate").text  # type: ignore
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")  # type: ignore

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )


@enforce_types
def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
    """Parse raw links from each line in a text file"""

    text_file.seek(0)
    for line in text_file.readlines():
        urls = re.findall(URL_REGEX, line) if line.strip() else ()
        for url in urls:  # type: ignore
            yield Link(
                url=htmldecode(url),
                timestamp=str(datetime.now().timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )