medium_rss.py 1.1 KB

1234567891011121314151617181920212223242526272829303132333435
  1. __package__ = 'archivebox.parsers'
  2. from typing import IO, Iterable
  3. from datetime import datetime
  4. from xml.etree import ElementTree
  5. from ..index.schema import Link
  6. from ..util import (
  7. htmldecode,
  8. enforce_types,
  9. )
  10. @enforce_types
  11. def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
  12. """Parse Medium RSS feed files into links"""
  13. rss_file.seek(0)
  14. root = ElementTree.parse(rss_file).getroot()
  15. items = root.find("channel").findall("item") # type: ignore
  16. for item in items:
  17. url = item.find("link").text # type: ignore
  18. title = item.find("title").text.strip() # type: ignore
  19. ts_str = item.find("pubDate").text # type: ignore
  20. time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
  21. yield Link(
  22. url=htmldecode(url),
  23. timestamp=str(time.timestamp()),
  24. title=htmldecode(title) or None,
  25. tags=None,
  26. sources=[rss_file.name],
  27. )