generic_html.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. __package__ = 'archivebox.parsers'
  2. import re
  3. from typing import IO, Iterable, Optional
  4. from datetime import datetime, timezone
  5. from ..index.schema import Link
  6. from ..util import (
  7. htmldecode,
  8. enforce_types,
  9. URL_REGEX,
  10. )
  11. from html.parser import HTMLParser
  12. from urllib.parse import urljoin
  13. class HrefParser(HTMLParser):
  14. def __init__(self):
  15. super().__init__()
  16. self.urls = []
  17. def handle_starttag(self, tag, attrs):
  18. if tag == "a":
  19. for attr, value in attrs:
  20. if attr == "href":
  21. self.urls.append(value)
  22. @enforce_types
  23. def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
  24. """Parse Generic HTML for href tags and use only the url (support for title coming later)"""
  25. html_file.seek(0)
  26. for line in html_file:
  27. parser = HrefParser()
  28. # example line
  29. # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
  30. parser.feed(line)
  31. for url in parser.urls:
  32. if root_url:
  33. # resolve relative urls /home.html -> https://example.com/home.html
  34. url = urljoin(root_url, url)
  35. for archivable_url in re.findall(URL_REGEX, url):
  36. yield Link(
  37. url=htmldecode(archivable_url),
  38. timestamp=str(datetime.now(timezone.utc).timestamp()),
  39. title=None,
  40. tags=None,
  41. sources=[html_file.name],
  42. )
# Module-level descriptors for this parser: a short key, a human-readable
# name, and the parse function itself.
# NOTE(review): presumably read by the parsers package to register this
# parser — confirm against the code that imports this module.
KEY = 'html'
NAME = 'Generic HTML'
PARSER = parse_generic_html_export