pocket_html.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
# Explicit package name; Python uses __package__ to resolve the relative
# import (`from ..index.schema import Link`) further down in this module.
__package__ = 'archivebox.parsers'
  2. import re
  3. from typing import IO, Iterable
  4. from datetime import datetime
  5. from ..index.schema import Link
  6. from archivebox.misc.util import (
  7. htmldecode,
  8. enforce_types,
  9. )
  10. @enforce_types
  11. def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
  12. """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
  13. html_file.seek(0)
  14. pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
  15. for line in html_file:
  16. # example line
  17. # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
  18. match = pattern.search(line)
  19. if match:
  20. url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
  21. time = datetime.fromtimestamp(float(match.group(2)))
  22. tags = match.group(3)
  23. title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
  24. yield Link(
  25. url=htmldecode(url),
  26. timestamp=str(time.timestamp()),
  27. title=htmldecode(title) or None,
  28. tags=tags or '',
  29. sources=[html_file.name],
  30. )
  31. KEY = 'pocket_html'
  32. NAME = 'Pocket HTML'
  33. PARSER = parse_pocket_html_export