parse.py

  1. """
  2. Everything related to parsing links from input sources.
  3. For a list of supported services, see the README.md.
  4. For examples of supported import formats see tests/.
  5. Link: {
  6. 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
  7. 'timestamp': '1544212312.4234',
  8. 'title': 'Example.com Page Title',
  9. 'tags': 'abc,def',
  10. 'sources': [
  11. 'output/sources/ril_export.html',
  12. 'output/sources/getpocket.com-1523422111.txt',
  13. 'output/sources/stdin-234234112312.txt'
  14. ]
  15. }
  16. """

import re
import json

from typing import Tuple, List, IO, Iterable
from datetime import datetime
import xml.etree.ElementTree as etree

from config import TIMEOUT
from util import (
    str_between,
    URL_REGEX,
    check_url_parsing_invariants,
    TimedProgress,
    Link,
)


def parse_links(source_file: str) -> Tuple[List[Link], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """

    check_url_parsing_invariants()

    PARSERS = (
        # Specialized parsers
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),

        # General parsers
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_rss_export),
        ('Generic JSON', parse_json_export),

        # Fallback parser
        ('Plain Text', parse_plain_text_export),
    )
    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in PARSERS:
            try:
                links = list(parser_func(file))
                if links:
                    timer.end()
                    return links, parser_name
            except Exception as err:
                # Parsers are tried one by one down the list, and the first one
                # that succeeds is used. To see why a certain parser was not used
                # due to error or format incompatibility, uncomment this line:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                pass

    timer.end()
    return [], 'Failed to parse'
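
# Example usage (a sketch, not part of the original module; the source path
# below is hypothetical):
#
#     links, parser_name = parse_links('output/sources/bookmarks_export.html')
#     print('Parsed {} links using the {} parser'.format(len(links), parser_name))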


### Import Parser Functions

def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
    pattern = re.compile(r'^\s*<li><a href="(.+)" time_added="(\d+)" tags="(.*)">(.+)</a></li>', re.UNICODE)
    for line in html_file:
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
            time = datetime.fromtimestamp(float(match.group(2)))
            tags = match.group(3)
            title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')

            yield {
                'url': url,
                'timestamp': str(time.timestamp()),
                'title': title or None,
                'tags': tags or '',
                'sources': [html_file.name],
            }
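
# Sketch of the readability-prefix cleanup above (URL is made up):
#
#     'http://www.readability.com/read?url=https://example.com/post'
#     .replace('http://www.readability.com/read?url=', '')
#     -> 'https://example.com/post'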


def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)
    links = json.load(json_file)
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    for link in links:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}
        if link:
            # Parse URL
            url = link.get('href') or link.get('url') or link.get('URL')
            if not url:
                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

            # Parse the timestamp
            ts_str = str(datetime.now().timestamp())
            if link.get('timestamp'):
                # chrome/ff histories use a very precise timestamp
                ts_str = str(link['timestamp'] / 10000000)
            elif link.get('time'):
                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
            elif link.get('created_at'):
                ts_str = str(json_date(link['created_at']).timestamp())
            elif link.get('created'):
                ts_str = str(json_date(link['created']).timestamp())
            elif link.get('date'):
                ts_str = str(json_date(link['date']).timestamp())
            elif link.get('bookmarked'):
                ts_str = str(json_date(link['bookmarked']).timestamp())
            elif link.get('saved'):
                ts_str = str(json_date(link['saved']).timestamp())

            # Parse the title
            title = None
            if link.get('title'):
                title = link['title'].strip() or None
            elif link.get('description'):
                title = link['description'].replace(' — Readability', '').strip() or None
            elif link.get('name'):
                title = link['name'].strip() or None

            yield {
                'url': url,
                'timestamp': ts_str,
                'title': title,
                'tags': link.get('tags') or '',
                'sources': [json_file.name],
            }
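
# Minimal usage sketch (assumes a Pinboard-style entry; io.StringIO stands in
# for a real export file, so .name has to be set by hand):
#
#     import io
#     fake = io.StringIO('[{"href": "https://example.com", "time": "2014-06-14T15:51:42Z", "tags": "demo"}]')
#     fake.name = 'output/sources/pinboard_export.json'
#     print(list(parse_json_export(fake)))  # one link dict, timestamp taken from "time"
#
# (Parsing the trailing 'Z' with '%z' requires Python 3.7+.)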


def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    items = rss_file.read().split('<item>')
    items = items[1:] if items else []
    for item in items:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
        title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title,
            'tags': '',
            'sources': [rss_file.name],
        }
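
# Note: this parser is line-oriented (each <item> is split on '\n'), so it
# assumes one tag per line, as in the example item above. A quick sketch:
#
#     import io
#     fake = io.StringIO('<item>\n'
#                        '<title><![CDATA[Example]]></title>\n'
#                        '<link>https://example.com</link>\n'
#                        '<pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>\n'
#                        '</item>')
#     fake.name = 'output/sources/feed.rss'
#     print(list(parse_rss_export(fake)))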


def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Shaarli-specific RSS XML-format files into links"""

    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        # <title>Current trojan wave: Emotet lurks in fake invoice emails | heise online</title>
        # <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        # <id>https://demo.shaarli.org/?cEV4vw</id>
        # <published>2019-01-30T06:06:01+00:00</published>
        # <updated>2019-01-30T06:06:01+00:00</updated>
        # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>

        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            # match '<{key}' without the closing '>' so attribute-carrying
            # tags like '<link href="..." />' are found too
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': '',
            'sources': [rss_file.name],
        }
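
# Worked sketch of the url extraction above (assuming str_between returns the
# text between its two marker arguments):
#
#     str_between('<link href="https://example.com/page" />', '<link href="', '" />')
#     -> 'https://example.com/page'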


def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

    html_file.seek(0)
    pattern = re.compile(r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>', re.UNICODE | re.IGNORECASE)
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))

            yield {
                'url': url,
                'timestamp': str(time.timestamp()),
                'title': match.group(3).strip() or None,
                'tags': '',
                'sources': [html_file.name],
            }
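
# For the example line above, the regex captures:
#
#     group(1) -> 'https://example.com/?q=1+2'    (href)
#     group(2) -> '1497562974'                    (add_date, unix seconds)
#     group(3) -> 'example bookmark title'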


def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        url = item.find("{http://purl.org/rss/1.0/}link").text
        # compare elements against None explicitly: an Element with no
        # children is falsy, so a bare truth test would skip found tags
        subject = item.find("{http://purl.org/dc/elements/1.1/}subject")
        tags = subject.text if subject is not None else None
        title = item.find("{http://purl.org/rss/1.0/}title").text.strip() or None
        ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text or None

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python < 3.7 can't parse. Remove it:
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3] + ts_str[-2:]

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now()

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': tags or '',
            'sources': [rss_file.name],
        }
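
# Worked example of the colon fix above:
#
#     '2019-01-30T06:06:01+00:00'  ->  '2019-01-30T06:06:01+0000'
#
# which '%Y-%m-%dT%H:%M:%S%z' parses on any Python 3 version (3.7+ would also
# accept the colon form directly).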


def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Medium RSS feed files into links"""

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")
    for item in items:
        url = item.find("link").text
        title = item.find("title").text.strip()
        ts_str = item.find("pubDate").text
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': '',
            'sources': [rss_file.name],
        }
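
# Medium pubDates look like 'Tue, 05 Feb 2019 02:33:12 GMT' (value made up);
# the '%Z' directive matches the trailing 'GMT'.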


def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
    """Parse raw links from each line in a text file"""

    text_file.seek(0)
    for line in text_file.readlines():
        urls = re.findall(URL_REGEX, line) if line.strip() else ()
        for url in urls:
            yield {
                'url': url,
                'timestamp': str(datetime.now().timestamp()),
                'title': None,
                'tags': '',
                'sources': [text_file.name],
            }
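
# Round-trip sketch (assuming URL_REGEX matches bare https:// URLs):
#
#     import io
#     fake = io.StringIO('check out https://example.com and https://example.org')
#     fake.name = 'output/sources/stdin-1234567890.txt'
#     print([link['url'] for link in parse_plain_text_export(fake)])
#     # -> ['https://example.com', 'https://example.org']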