htmltotext.py

__package__ = 'archivebox.extractors'

from html.parser import HTMLParser
import io
from pathlib import Path
from typing import Optional

from ..config import (
    SAVE_HTMLTOTEXT,
    TIMEOUT,
    VERSION,
)
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress
from ..system import atomic_write
from ..util import (
    enforce_types,
    is_static_file,
)
from .title import get_html


class HTMLTextExtractor(HTMLParser):
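    """
    Parse HTML and emit a plain-text stream suitable for search indexing:
    text inside <script>/<style>/<template> is dropped, and the values of a
    few useful attributes (alt, title, href, etc.) are written out in
    parentheses alongside the document text.
    """
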
    TEXT_ATTRS = [
        "alt", "cite", "href", "label",
        "list", "placeholder", "title", "value"
    ]
    NOTEXT_TAGS = ["script", "style", "template"]
    NOTEXT_HREF = ["data:", "javascript:", "#"]

    def __init__(self):
        super().__init__()

        self.output = io.StringIO()
        self._tag_stack = []

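    # Attribute values are only emitted for the whitelisted TEXT_ATTRS, and
    # href values that are data:/javascript: URIs or bare fragments are
    # never treated as text.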
    def _is_text_attr(self, name, value):
        if not isinstance(value, str):
            return False
        if name == "href" and any(map(lambda p: value.startswith(p), self.NOTEXT_HREF)):
            return False
        if name in self.TEXT_ATTRS:
            return True
        return False

    def _parent_tag(self):
        try:
            return self._tag_stack[-1]
        except IndexError:
            return None

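    # True if any open ancestor tag is one of NOTEXT_TAGS, i.e. we are
    # currently inside a <script>, <style>, or <template> element.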
    def _in_notext_tag(self):
        return any([t in self._tag_stack for t in self.NOTEXT_TAGS])

    def handle_starttag(self, tag, attrs):
        self._tag_stack.append(tag)

        # Don't write out attribute values if any ancestor
        # is in NOTEXT_TAGS
        if self._in_notext_tag():
            return

        for name, value in attrs:
            if self._is_text_attr(name, value):
                self.output.write(f"({value.strip()}) ")

    def handle_endtag(self, tag):
        orig_stack = self._tag_stack.copy()

        try:
            # Keep popping tags until we find the nearest
            # ancestor matching this end tag
            while tag != self._tag_stack.pop():
                pass

            # Write a space after every tag, to ensure that tokens
            # in tag text aren't concatenated. This may result in
            # excess spaces, which should be ignored by search tokenizers.
            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
                self.output.write(" ")
        except IndexError:
            # Got to the top of the stack, but somehow missed
            # this end tag -- maybe malformed markup -- restore the
            # stack
            self._tag_stack = orig_stack

    def handle_data(self, data):
        # Don't output text data if any ancestor is in NOTEXT_TAGS
        if self._in_notext_tag():
            return

        data = data.lstrip()
        len_before_rstrip = len(data)
        data = data.rstrip()
        spaces_rstripped = len_before_rstrip - len(data)

        if data:
            self.output.write(data)

            if spaces_rstripped:
                # Add back a single space if 1 or more
                # whitespace characters were stripped
                self.output.write(' ')

    def __str__(self):
        return self.output.getvalue()


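# Extraction is skipped for static files (PDFs, images, etc.), skipped if
# htmltotext.txt already exists and overwrite is not requested, and gated
# overall by the SAVE_HTMLTOTEXT config flag.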
@enforce_types
def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    if is_static_file(link.url):
        return False

    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and (out_dir / 'htmltotext.txt').exists():
        return False

    return SAVE_HTMLTOTEXT


@enforce_types
def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """extract search-indexing-friendly text from an HTML document"""

    out_dir = Path(out_dir or link.link_dir)
    output = "htmltotext.txt"
    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

    timer = TimedProgress(timeout, prefix=' ')
    extracted_text = None
    status = 'failed'
    try:
        extractor = HTMLTextExtractor()
        document = get_html(link, out_dir)

        if not document:
            raise ArchiveError('htmltotext could not find HTML to parse for article text')

        extractor.feed(document)
        extractor.close()
        extracted_text = str(extractor)

        atomic_write(str(out_dir / output), extracted_text)
        status = 'succeeded'
    except (Exception, OSError) as err:
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        cmd_version=VERSION,
        output=output,
        status=status,
        index_texts=[extracted_text] if extracted_text else [],
        **timer.stats,
    )
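

# Illustrative usage sketch (kept as a comment, not executed as part of the
# module): feeding an HTML snippet through HTMLTextExtractor yields plain
# text with script/style content dropped and whitelisted attribute values
# written out in parentheses. The extra trailing spaces are expected and
# are meant to be ignored by search tokenizers.
#
#   extractor = HTMLTextExtractor()
#   extractor.feed('<p title="greeting">Hello <script>x()</script><b>world</b></p>')
#   extractor.close()
#   str(extractor)  # -> '(greeting) Hello world  '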