generic_txt.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. __package__ = 'archivebox.parsers'
  2. __description__ = 'Plain Text'
  3. from typing import IO, Iterable
  4. from datetime import datetime, timezone
  5. from ..index.schema import Link
  6. from archivebox.misc.util import (
  7. htmldecode,
  8. enforce_types,
  9. find_all_urls,
  10. )
  11. @enforce_types
  12. def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
  13. """Parse links from a text file, ignoring other text"""
  14. text_file.seek(0)
  15. for line in text_file.readlines():
  16. if not line.strip():
  17. continue
  18. # # if the line is a local file path that resolves, then we can archive it
  19. # if line.startswith('file://'):
  20. # try:
  21. # if Path(line).exists():
  22. # yield Link(
  23. # url=line,
  24. # timestamp=str(datetime.now(timezone.utc).timestamp()),
  25. # title=None,
  26. # tags=None,
  27. # sources=[text_file.name],
  28. # )
  29. # except (OSError, PermissionError):
  30. # # nvm, not a valid path...
  31. # pass
  32. # otherwise look for anything that looks like a URL in the line
  33. for url in find_all_urls(line):
  34. yield Link(
  35. url=htmldecode(url),
  36. timestamp=str(datetime.now(timezone.utc).timestamp()),
  37. title=None,
  38. tags=None,
  39. sources=[text_file.name],
  40. )
  41. KEY = 'txt'
  42. NAME = 'Generic TXT'
  43. PARSER = parse_generic_txt_export