  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. Link: {
  6. 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
  7. 'timestamp': '1544212312.4234',
  8. 'title': 'Example.com Page Title',
  9. 'tags': 'abc,def',
  10. 'sources': [
  11. 'output/sources/ril_export.html',
  12. 'output/sources/getpocket.com-1523422111.txt',
  13. 'output/sources/stdin-234234112312.txt'
  14. ]
  15. }
  16. """
import re
import json

from typing import Tuple, List, IO, Iterable
from datetime import datetime
import xml.etree.ElementTree as etree

from config import TIMEOUT
from util import (
    htmldecode,
    str_between,
    URL_REGEX,
    check_url_parsing_invariants,
    TimedProgress,
    Link,
)


def parse_links(source_file: str) -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """

    check_url_parsing_invariants()

    PARSERS = (
        # Specialized parsers
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),

        # General parsers
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_rss_export),
        ('Generic JSON', parse_json_export),

        # Fallback parser
        ('Plain Text', parse_plain_text_export),
    )

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in PARSERS:
            try:
                links = list(parser_func(file))
                if links:
                    timer.end()
                    return links, parser_name
            except Exception as err:
                # Parsers are tried one by one down the list, and the first one
                # that succeeds is used. To see why a certain parser was not used
                # due to error or format incompatibility, uncomment this line:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                pass

    timer.end()
    return [], 'Failed to parse'
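
# Example usage of parse_links() (a sketch; 'output/sources/bookmarks.html' is
# a hypothetical path -- each parser is tried in order and the first one that
# yields any links wins):
#
#   links, parser_name = parse_links('output/sources/bookmarks.html')
#   print('Parsed {} links using the {} parser'.format(len(links), parser_name))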


### Import Parser Functions


def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
    pattern = re.compile(r'^\s*<li><a href="(.+)" time_added="(\d+)" tags="(.*)">(.+)</a></li>', re.UNICODE)
    for line in html_file:
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
            time = datetime.fromtimestamp(float(match.group(2)))
            tags = match.group(3)
            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')

            yield Link(
                url=url,
                timestamp=str(time.timestamp()),
                title=title or None,
                tags=tags or '',
                sources=[html_file.name],
            )
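
# For the example line in parse_pocket_html_export() above, pattern.search()
# captures:
#   group(1) == 'http://example.com/'  (url)
#   group(2) == '1478739709'           (time_added)
#   group(3) == 'tag1,tag2'            (tags)
#   group(4) == 'example title'        (title)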


def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)
    links = json.load(json_file)
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    for link in links:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if link:
            # Parse URL
            url = link.get('href') or link.get('url') or link.get('URL')
            if not url:
                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

            # Parse the timestamp
            ts_str = str(datetime.now().timestamp())
            if link.get('timestamp'):
                # chrome/ff histories use a very precise timestamp;
                # it may arrive as a string, so coerce it before scaling
                ts_str = str(int(link['timestamp']) / 10000000)
            elif link.get('time'):
                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
            elif link.get('created_at'):
                ts_str = str(json_date(link['created_at']).timestamp())
            elif link.get('created'):
                ts_str = str(json_date(link['created']).timestamp())
            elif link.get('date'):
                ts_str = str(json_date(link['date']).timestamp())
            elif link.get('bookmarked'):
                ts_str = str(json_date(link['bookmarked']).timestamp())
            elif link.get('saved'):
                ts_str = str(json_date(link['saved']).timestamp())

            # Parse the title
            title = None
            if link.get('title'):
                title = link['title'].strip()
            elif link.get('description'):
                title = link['description'].replace(' — Readability', '').strip()
            elif link.get('name'):
                title = link['name'].strip()

            yield Link(
                url=url,
                timestamp=ts_str,
                title=htmldecode(title) or None,
                tags=link.get('tags') or '',
                sources=[json_file.name],
            )
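
# Note on parse_json_export() above: the timestamp keys are checked in a fixed
# priority order (timestamp, time, created_at, created, date, bookmarked,
# saved), so an entry carrying both 'time' and 'date' gets its timestamp from
# 'time', and entries with none of these keys fall back to the current time.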


def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    items = rss_file.read().split('<item>')
    items = items[1:] if items else []
    for item in items:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()

        yield Link(
            url=url,
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags='',
            sources=[rss_file.name],
        )
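
# Note: parse_rss_export() above works by naive string splitting rather than
# real XML parsing, so it expects each <item> child tag on its own line;
# minified single-line feeds won't match and will fall through to the next
# parser in parse_links().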


def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Shaarli-specific RSS XML-format files into links"""

    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        # <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        # <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        # <id>https://demo.shaarli.org/?cEV4vw</id>
        # <published>2019-01-30T06:06:01+00:00</published>
        # <updated>2019-01-30T06:06:01+00:00</updated>
        # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>

        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        yield Link(
            url=url,
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags='',
            sources=[rss_file.name],
        )


def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

    html_file.seek(0)
    pattern = re.compile(r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>', re.UNICODE | re.IGNORECASE)
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))
            title = match.group(3).strip()

            yield Link(
                url=url,
                timestamp=str(time.timestamp()),
                title=htmldecode(title) or None,
                tags='',
                sources=[html_file.name],
            )


def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        url = item.find("{http://purl.org/rss/1.0/}link").text
        # An Element's truthiness reflects whether it has children, not whether
        # it was found, so compare optional elements against None explicitly:
        tags_elem = item.find("{http://purl.org/dc/elements/1.1/}subject")
        tags = tags_elem.text if tags_elem is not None else None
        title_elem = item.find("{http://purl.org/rss/1.0/}title")
        title = title_elem.text.strip() if title_elem is not None and title_elem.text else None
        date_elem = item.find("{http://purl.org/dc/elements/1.1/}date")
        ts_str = date_elem.text if date_elem is not None else None

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python (before 3.7) can't parse with %z. Remove it:
        # e.g. '2019-01-30T06:06:01+00:00' -> '2019-01-30T06:06:01+0000'
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3] + ts_str[-2:]

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now()

        yield Link(
            url=url,
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=tags or '',
            sources=[rss_file.name],
        )


def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Medium RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")
    for item in items:
        url = item.find("link").text
        title = item.find("title").text.strip()
        ts_str = item.find("pubDate").text
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")

        yield Link(
            url=url,
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags='',
            sources=[rss_file.name],
        )


def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
    """Parse raw links from each line in a text file"""

    text_file.seek(0)
    for line in text_file.readlines():
        urls = re.findall(URL_REGEX, line) if line.strip() else ()
        for url in urls:
            yield Link(
                url=url,
                timestamp=str(datetime.now().timestamp()),
                title=None,
                tags='',
                sources=[text_file.name],
            )
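

# A minimal smoke-test entrypoint (a sketch, not part of the module's public
# interface; assumes this file is run directly with a source-file path as the
# first CLI argument):
if __name__ == '__main__':
    import sys

    parsed_links, used_parser = parse_links(sys.argv[1])
    print('[{}] parsed {} links'.format(used_parser, len(parsed_links)))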