on_Snapshot__58_htmltotext.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. #!/usr/bin/env python3
  2. """
  3. Convert HTML to plain text for search indexing.
  4. This extractor reads HTML from other extractors (wget, singlefile, dom)
  5. and converts it to plain text for full-text search.
  6. Usage: on_Snapshot__htmltotext.py --url=<url> --snapshot-id=<uuid>
  7. Output: Writes htmltotext.txt to $PWD
  8. Environment variables:
  9. TIMEOUT: Timeout in seconds (not used, but kept for consistency)
  10. Note: This extractor does not require any external binaries.
  11. It uses Python's built-in html.parser module.
  12. """
  13. import json
  14. import os
  15. import re
  16. import sys
  17. from html.parser import HTMLParser
  18. from pathlib import Path
  19. import rich_click as click
# Extractor metadata
# Short identifier for this extractor (not referenced elsewhere in the code shown here).
PLUGIN_NAME = 'htmltotext'
# Directory the output file is written into, relative to the working directory.
OUTPUT_DIR = '.'
# File name of the plain-text output written by extract_htmltotext().
OUTPUT_FILE = 'htmltotext.txt'
  24. class HTMLTextExtractor(HTMLParser):
  25. """Extract text content from HTML, ignoring scripts/styles."""
  26. def __init__(self):
  27. super().__init__()
  28. self.result = []
  29. self.skip_tags = {'script', 'style', 'head', 'meta', 'link', 'noscript'}
  30. self.current_tag = None
  31. def handle_starttag(self, tag, attrs):
  32. self.current_tag = tag.lower()
  33. def handle_endtag(self, tag):
  34. self.current_tag = None
  35. def handle_data(self, data):
  36. if self.current_tag not in self.skip_tags:
  37. text = data.strip()
  38. if text:
  39. self.result.append(text)
  40. def get_text(self) -> str:
  41. return ' '.join(self.result)
  42. def html_to_text(html: str) -> str:
  43. """Convert HTML to plain text."""
  44. parser = HTMLTextExtractor()
  45. try:
  46. parser.feed(html)
  47. return parser.get_text()
  48. except Exception:
  49. # Fallback: strip HTML tags with regex
  50. text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
  51. text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
  52. text = re.sub(r'<[^>]+>', ' ', text)
  53. text = re.sub(r'\s+', ' ', text)
  54. return text.strip()
  55. def find_html_source() -> str | None:
  56. """Find HTML content from other extractors in the snapshot directory."""
  57. # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
  58. search_patterns = [
  59. 'singlefile/singlefile.html',
  60. '*_singlefile/singlefile.html',
  61. 'singlefile/*.html',
  62. '*_singlefile/*.html',
  63. 'dom/output.html',
  64. '*_dom/output.html',
  65. 'dom/*.html',
  66. '*_dom/*.html',
  67. 'wget/**/*.html',
  68. '*_wget/**/*.html',
  69. 'wget/**/*.htm',
  70. '*_wget/**/*.htm',
  71. ]
  72. for base in (Path.cwd(), Path.cwd().parent):
  73. for pattern in search_patterns:
  74. matches = list(base.glob(pattern))
  75. for match in matches:
  76. if match.is_file() and match.stat().st_size > 0:
  77. try:
  78. return match.read_text(errors='ignore')
  79. except Exception:
  80. continue
  81. return None
  82. def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
  83. """
  84. Extract plain text from HTML sources.
  85. Returns: (success, output_path, error_message)
  86. """
  87. # Find HTML source from other extractors
  88. html_content = find_html_source()
  89. if not html_content:
  90. return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
  91. # Convert HTML to text
  92. text = html_to_text(html_content)
  93. if not text or len(text) < 10:
  94. return False, None, 'No meaningful text extracted from HTML'
  95. # Output directory is current directory (hook already runs in output dir)
  96. output_dir = Path(OUTPUT_DIR)
  97. output_path = output_dir / OUTPUT_FILE
  98. output_path.write_text(text, encoding='utf-8')
  99. return True, str(output_path), ''
  100. @click.command()
  101. @click.option('--url', required=True, help='URL that was archived')
  102. @click.option('--snapshot-id', required=True, help='Snapshot UUID')
  103. def main(url: str, snapshot_id: str):
  104. """Convert HTML to plain text for search indexing."""
  105. try:
  106. # Run extraction
  107. success, output, error = extract_htmltotext(url)
  108. if success:
  109. # Success - emit ArchiveResult
  110. result = {
  111. 'type': 'ArchiveResult',
  112. 'status': 'succeeded',
  113. 'output_str': output or ''
  114. }
  115. print(json.dumps(result))
  116. sys.exit(0)
  117. else:
  118. # Transient error - emit NO JSONL
  119. print(f'ERROR: {error}', file=sys.stderr)
  120. sys.exit(1)
  121. except Exception as e:
  122. # Transient error - emit NO JSONL
  123. print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
  124. sys.exit(1)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()