_title_parser.py 829 B

12345678910111213141516171819202122232425262728293031323334
  1. from __future__ import annotations
  2. from html.parser import HTMLParser
  3. def get_title(title: str) -> tuple[str, str]:
  4. htp = HTMLTextParser()
  5. htp.feed(title)
  6. htp.close()
  7. return htp.text, htp.text_outside_tags
  8. class HTMLTextParser(HTMLParser):
  9. """Parse HTML into text."""
  10. def __init__(self) -> None:
  11. super().__init__()
  12. # All text found
  13. self.text = ''
  14. # Only text outside of html tags
  15. self.text_outside_tags = ''
  16. self.level = 0
  17. def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
  18. self.level += 1
  19. def handle_endtag(self, tag: str) -> None:
  20. self.level -= 1
  21. def handle_data(self, data: str) -> None:
  22. self.text += data
  23. if self.level == 0:
  24. self.text_outside_tags += data