netscape_html.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. __package__ = 'archivebox.parsers'
  2. import re
  3. from typing import IO, Iterable
  4. from datetime import datetime
  5. from ..index.schema import Link
  6. from ..util import (
  7. htmldecode,
  8. enforce_types,
  9. )
  10. @enforce_types
  11. def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]:
  12. """Parse netscape-format bookmarks export files (produced by all browsers)"""
  13. html_file.seek(0)
  14. pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
  15. for line in html_file:
  16. # example line
  17. # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
  18. match = pattern.search(line)
  19. if match:
  20. url = match.group(1)
  21. time = datetime.fromtimestamp(float(match.group(2)))
  22. title = match.group(3).strip()
  23. yield Link(
  24. url=htmldecode(url),
  25. timestamp=str(time.timestamp()),
  26. title=htmldecode(title) or None,
  27. tags=None,
  28. sources=[html_file.name],
  29. )
  30. KEY = 'netscape_html'
  31. NAME = 'Netscape HTML'
  32. PARSER = parse_netscape_html_export