2
0

medium_rss.py 1.1 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. __package__ = 'archivebox.parsers'
  2. from typing import IO, Iterable
  3. from datetime import datetime
  4. from django.db.models import Model
  5. from xml.etree import ElementTree
  6. from ..util import (
  7. htmldecode,
  8. enforce_types,
  9. )
  10. @enforce_types
  11. def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Model]:
  12. """Parse Medium RSS feed files into links"""
  13. from core.models import Snapshot
  14. rss_file.seek(0)
  15. root = ElementTree.parse(rss_file).getroot()
  16. items = root.find("channel").findall("item") # type: ignore
  17. for item in items:
  18. url = item.find("link").text # type: ignore
  19. title = item.find("title").text.strip() # type: ignore
  20. ts_str = item.find("pubDate").text # type: ignore
  21. time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
  22. yield Snapshot(
  23. url=htmldecode(url),
  24. timestamp=str(time.timestamp()),
  25. title=htmldecode(title) or None,
  26. #tags=None,
  27. #sources=[rss_file.name],
  28. )