url_list.py 824 B

12345678910111213141516171819202122232425262728293031323334353637
# Package this module belongs to. Python consults this value when resolving
# relative imports such as the `..index.schema` import further down this file.
__package__ = 'archivebox.parsers'
# Short human-readable label for this parser module.
# NOTE(review): where this label is consumed is not visible in this file.
__description__ = 'URL list'
  3. import re
  4. from typing import IO, Iterable
  5. from datetime import datetime, timezone
  6. from ..index.schema import Link
  7. from archivebox.misc.util import (
  8. enforce_types,
  9. URL_REGEX,
  10. )
  11. @enforce_types
  12. def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
  13. """Parse raw URLs from each line in a text file"""
  14. text_file.seek(0)
  15. for line in text_file.readlines():
  16. url = line.strip()
  17. if (not url) or not re.findall(URL_REGEX, url):
  18. continue
  19. yield Link(
  20. url=url,
  21. timestamp=str(datetime.now(timezone.utc).timestamp()),
  22. title=None,
  23. tags=None,
  24. sources=[text_file.name],
  25. )
# Module-level metadata describing this parser.
# NOTE(review): presumably read by a parser registry elsewhere; the consumer
# is not visible in this file, so confirm against the caller.
# Identifier string for this parser.
KEY = 'url_list'
# Display name for this parser.
NAME = 'URL List'
# The parsing callable defined above in this module.
PARSER = parse_url_list