# coding: utf-8

"""
Everything related to parsing links from bookmark services.

For a list of supported services, see the README.md.
For examples of supported files see examples/.

Parsed link schema: {
    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
    'domain': 'example.com',
    'base_url': 'example.com/example/',
    'timestamp': '15442123124234',
    'tags': 'abc,def',
    'title': 'Example.com Page Title',
    'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
}
"""

import re
import sys
import json
import urllib
from collections import OrderedDict
import xml.etree.ElementTree as etree
from datetime import datetime

from config import ANSI, SHOW_PROGRESS
from util import (
    domain,
    base_url,
    str_between,
    get_link_type,
    fetch_page_title,
    URL_REGEX,
)


def get_parsers(file):
    """return all parsers that work on a given file, defaults to all of them"""
    return OrderedDict([
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard JSON', parse_pinboard_json_export),
        ('Netscape HTML', parse_netscape_html_export),
        ('RSS', parse_rss_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),
        ('Plain Text', parse_plain_text_export),
    ])
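
# Every parser above follows the same generator contract: it takes an open
# file handle, calls file.seek(0) before reading, and yields link dicts
# matching the schema in the module docstring. A hypothetical custom parser
# registered in the OrderedDict above would need to follow the same contract.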


def parse_links(path):
    """parse a list of link dictionaries from a bookmark export file"""
    links = []
    with open(path, 'r', encoding='utf-8') as file:
        print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            path.rsplit('/', 1)[-1],
            **ANSI,
        ))
        if SHOW_PROGRESS:
            sys.stdout.write(' ')

        for parser_name, parser_func in get_parsers(file).items():
            # try all parsers until one works
            try:
                links += list(parser_func(file))
                if links:
                    break
            except Exception as err:
                # we try each parser one by one, and each parser raises an
                # exception if the file is in an unsupported format, so we
                # accept the first one that parses without errors
                # uncomment the following line to see why the parser was unsupported for each attempted format
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                pass

    return links, parser_name
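
# Example usage (a minimal sketch, assuming this module is importable as
# `parse` and that 'output/sources/bookmarks_export.html' is a hypothetical
# export file on disk):
#
#   from parse import parse_links
#   links, parser_name = parse_links('output/sources/bookmarks_export.html')
#   print('Parsed {} links via the {} parser'.format(len(links), parser_name))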


def parse_pocket_html_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
    html_file.seek(0)
    pattern = re.compile(r'^\s*<li><a href="(.+)" time_added="(\d+)" tags="(.*)">(.+)</a></li>', re.UNICODE)
    for line in html_file:
        # example line
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        match = pattern.search(line)
        if match:
            fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
            time = datetime.fromtimestamp(float(match.group(2)))

            info = {
                'url': fixed_url,
                'domain': domain(fixed_url),
                'base_url': base_url(fixed_url),
                'timestamp': str(time.timestamp()),
                'tags': match.group(3),
                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
                'sources': [html_file.name],
            }
            info['type'] = get_link_type(info)

            yield info


def parse_pinboard_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if line:
            erg = line
            if erg.get('timestamp'):
                timestamp = str(erg['timestamp'] / 10000000)  # chrome/ff histories use a very precise timestamp
            elif erg.get('time'):
                timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
            elif erg.get('created_at'):
                timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
            else:
                timestamp = str(datetime.now().timestamp())
            if erg.get('href'):
                url = erg['href']
            else:
                url = erg['url']
            if erg.get('description'):
                title = (erg.get('description') or '').replace(' — Readability', '')
            else:
                title = erg['title'].strip()

            info = {
                'url': url,
                'domain': domain(url),
                'base_url': base_url(url),
                'timestamp': timestamp,
                'tags': erg.get('tags') or '',
                'title': title or fetch_page_title(url),
                'sources': [json_file.name],
            }
            info['type'] = get_link_type(info)

            yield info


def parse_rss_export(rss_file):
    """Parse RSS XML-format files into links"""
    rss_file.seek(0)
    items = rss_file.read().split('</item>\n<item>')
    for item in items:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1]
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
        url = str_between(get_row('link'), '<link>', '</link>')
        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': '',
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info


def parse_shaarli_rss_export(rss_file):
    """Parse Shaarli-specific RSS XML-format files into links"""
    rss_file.seek(0)
    entries = rss_file.read().split('<entry>')[1:]
    for entry in entries:
        # example entry:
        # <entry>
        # <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        # <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        # <id>https://demo.shaarli.org/?cEV4vw</id>
        # <published>2019-01-30T06:06:01+00:00</published>
        # <updated>2019-01-30T06:06:01+00:00</updated>
        # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>

        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')

        def get_row(key):
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': '',
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info


def parse_netscape_html_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""
    html_file.seek(0)
    pattern = re.compile(r'<a href="(.+?)" add_date="(\d+)"[^>]*>(.+)</a>', re.UNICODE | re.IGNORECASE)
    for line in html_file:
        # example line
        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))

            info = {
                'url': url,
                'domain': domain(url),
                'base_url': base_url(url),
                'timestamp': str(time.timestamp()),
                'tags': "",
                'title': match.group(3).strip() or fetch_page_title(url),
                'sources': [html_file.name],
            }
            info['type'] = get_link_type(info)

            yield info


def parse_pinboard_rss_export(rss_file):
    """Parse Pinboard RSS feed files into links"""
    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        url = item.find("{http://purl.org/rss/1.0/}link").text
        tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text
        title = item.find("{http://purl.org/rss/1.0/}title").text.strip()
        ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text
        # Pinboard includes a colon in its date stamp timezone offsets, which
        # Python can't parse. Remove it:
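        # e.g. a hypothetical '2019-01-30T06:06:01+00:00' becomes
        # '2019-01-30T06:06:01+0000', which %z below can parse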
        if ":" == ts_str[-3:-2]:
            ts_str = ts_str[:-3] + ts_str[-2:]
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': tags,
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info


def parse_medium_rss_export(rss_file):
    """Parse Medium RSS feed files into links"""
    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.find("channel").findall("item")
    for item in items:
        # for child in item:
        #     print(child.tag, child.text)
        url = item.find("link").text
        title = item.find("title").text.strip()
        ts_str = item.find("pubDate").text
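        # pubDate uses a named timezone rather than a numeric offset, hence %Z
        # below (e.g. a hypothetical 'Mon, 21 Aug 2017 14:21:58 GMT')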
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': '',
            'title': title or fetch_page_title(url),
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)

        yield info


def parse_plain_text_export(text_file):
    """Parse raw links from each line in a text file"""
    text_file.seek(0)
    text_content = text_file.readlines()
    for line in text_content:
        if line:
            urls = re.findall(URL_REGEX, line)
            for url in urls:
                url = url.strip()

                info = {
                    'url': url,
                    'domain': domain(url),
                    'base_url': base_url(url),
                    'timestamp': str(datetime.now().timestamp()),
                    'tags': '',
                    'title': fetch_page_title(url),
                    'sources': [text_file.name],
                }
                info['type'] = get_link_type(info)

                yield info