generic_txt.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
# Explicit package name for this module; Python resolves the relative
# imports below (``from ..index.schema``, ``from ..util``) against it.
__package__ = 'archivebox.parsers'
# Human-readable label for the input format this parser handles.
# NOTE(review): the consumer of this value is not visible in this file.
__description__ = 'Plain Text'
  3. from typing import IO, Iterable
  4. from datetime import datetime, timezone
  5. from pathlib import Path
  6. from ..index.schema import Link
  7. from ..util import (
  8. htmldecode,
  9. enforce_types,
  10. find_all_urls,
  11. )
  12. @enforce_types
  13. def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
  14. """Parse links from a text file, ignoring other text"""
  15. text_file.seek(0)
  16. for line in text_file.readlines():
  17. if not line.strip():
  18. continue
  19. # if the line is a local file path that resolves, then we can archive it
  20. try:
  21. if Path(line).exists():
  22. yield Link(
  23. url=line,
  24. timestamp=str(datetime.now(timezone.utc).timestamp()),
  25. title=None,
  26. tags=None,
  27. sources=[text_file.name],
  28. )
  29. except (OSError, PermissionError):
  30. # nvm, not a valid path...
  31. pass
  32. # otherwise look for anything that looks like a URL in the line
  33. for url in find_all_urls(line):
  34. yield Link(
  35. url=htmldecode(url),
  36. timestamp=str(datetime.now(timezone.utc).timestamp()),
  37. title=None,
  38. tags=None,
  39. sources=[text_file.name],
  40. )
# Module-level metadata describing this parser.
# NOTE(review): presumably read by a parser registry elsewhere in the
# package — the consumer is not visible in this file; confirm before renaming.
KEY = 'txt'                        # short identifier for this parser
NAME = 'Generic TXT'               # display name for this parser
PARSER = parse_generic_txt_export  # callable that performs the parsing