generic_txt.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. __package__ = 'archivebox.parsers'
  2. __description__ = 'Plain Text'
  3. import re
  4. from typing import IO, Iterable
  5. from datetime import datetime, timezone
  6. from pathlib import Path
  7. from ..index.schema import Link
  8. from ..util import (
  9. htmldecode,
  10. enforce_types,
  11. URL_REGEX
  12. )
  13. @enforce_types
  14. def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
  15. """Parse links from a text file, ignoring other text"""
  16. text_file.seek(0)
  17. for line in text_file.readlines():
  18. if not line.strip():
  19. continue
  20. # if the line is a local file path that resolves, then we can archive it
  21. try:
  22. if Path(line).exists():
  23. yield Link(
  24. url=line,
  25. timestamp=str(datetime.now(timezone.utc).timestamp()),
  26. title=None,
  27. tags=None,
  28. sources=[text_file.name],
  29. )
  30. except (OSError, PermissionError):
  31. # nvm, not a valid path...
  32. pass
  33. # otherwise look for anything that looks like a URL in the line
  34. for url in re.findall(URL_REGEX, line):
  35. yield Link(
  36. url=htmldecode(url),
  37. timestamp=str(datetime.now(timezone.utc).timestamp()),
  38. title=None,
  39. tags=None,
  40. sources=[text_file.name],
  41. )
  42. # look inside the URL for any sub-urls, e.g. for archive.org links
  43. # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
  44. # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
  45. for sub_url in re.findall(URL_REGEX, line[1:]):
  46. yield Link(
  47. url=htmldecode(sub_url),
  48. timestamp=str(datetime.now(timezone.utc).timestamp()),
  49. title=None,
  50. tags=None,
  51. sources=[text_file.name],
  52. )
  53. KEY = 'txt'
  54. NAME = 'Generic TXT'
  55. PARSER = parse_generic_txt_export