  1. #!/usr/bin/env python3
  2. """
  3. Extract article content using Mozilla's Readability.
  4. Usage: on_Snapshot__readability.py --url=<url> --snapshot-id=<uuid>
  5. Output: Creates readability/ directory with content.html, content.txt, article.json
  6. Environment variables:
  7. READABILITY_BINARY: Path to readability-extractor binary
  8. READABILITY_TIMEOUT: Timeout in seconds (default: 60)
  9. READABILITY_ARGS: Default Readability arguments (JSON array)
  10. READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
  11. TIMEOUT: Fallback timeout
  12. Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
  13. This extractor looks for HTML source from other extractors (wget, singlefile, dom)
  14. """
  15. import json
  16. import os
  17. import subprocess
  18. import sys
  19. import tempfile
  20. from pathlib import Path
  21. from urllib.parse import urlparse
  22. import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'readability'            # plugin/hook identifier used for this extractor
BIN_NAME = 'readability-extractor'     # name of the external binary invoked by extract_readability()
BIN_PROVIDERS = 'npm,env'              # where the binary may be provided from (npm install or env override)
OUTPUT_DIR = '.'                       # hook already runs inside the snapshot output dir, so output is cwd
OUTPUT_FILE = 'content.html'           # primary output artifact reported back in the ArchiveResult
  29. def get_env(name: str, default: str = '') -> str:
  30. return os.environ.get(name, default).strip()
  31. def get_env_int(name: str, default: int = 0) -> int:
  32. try:
  33. return int(get_env(name, str(default)))
  34. except ValueError:
  35. return default
  36. def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
  37. """Parse a JSON array from environment variable."""
  38. val = get_env(name, '')
  39. if not val:
  40. return default if default is not None else []
  41. try:
  42. result = json.loads(val)
  43. if isinstance(result, list):
  44. return [str(item) for item in result]
  45. return default if default is not None else []
  46. except json.JSONDecodeError:
  47. return default if default is not None else []
  48. def find_html_source() -> str | None:
  49. """Find HTML content from other extractors in the snapshot directory."""
  50. # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
  51. search_patterns = [
  52. 'singlefile/singlefile.html',
  53. '*_singlefile/singlefile.html',
  54. 'singlefile/*.html',
  55. '*_singlefile/*.html',
  56. 'dom/output.html',
  57. '*_dom/output.html',
  58. 'dom/*.html',
  59. '*_dom/*.html',
  60. 'wget/**/*.html',
  61. '*_wget/**/*.html',
  62. 'wget/**/*.htm',
  63. '*_wget/**/*.htm',
  64. ]
  65. for base in (Path.cwd(), Path.cwd().parent):
  66. for pattern in search_patterns:
  67. matches = list(base.glob(pattern))
  68. for match in matches:
  69. if match.is_file() and match.stat().st_size > 0:
  70. return str(match)
  71. return None
def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article content from a previously-captured HTML file using
    the readability-extractor binary, writing content.html, content.txt,
    and article.json into the current directory.

    Args:
        url: Original page URL (used only to locate a responses/ images dir).
        binary: Path or name of the readability-extractor executable.

    Returns: (success, output_path, error_message) — output_path is
    OUTPUT_FILE on success, None otherwise; error_message is '' on success.
    """
    # READABILITY_TIMEOUT wins over the generic TIMEOUT; default 60s.
    timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
    readability_args = get_env_array('READABILITY_ARGS', [])
    readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])
    # Find HTML source captured by a sibling extractor (singlefile/dom/wget).
    html_source = find_html_source()
    if not html_source:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
    try:
        # Run readability-extractor (outputs JSON by default)
        cmd = [binary, *readability_args, *readability_args_extra, html_source]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True)
        # Echo the tool's stdout to our stderr for logging; stdout itself
        # is reserved for the ArchiveResult JSONL emitted by main().
        if result.stdout:
            sys.stderr.write(result.stdout)
            sys.stderr.flush()
        if result.returncode != 0:
            return False, None, f'readability-extractor failed (exit={result.returncode})'
        # Parse JSON output
        try:
            result_json = json.loads(result.stdout)
        except json.JSONDecodeError:
            return False, None, 'readability-extractor returned invalid JSON'
        # Extract and save content
        # readability-extractor uses camelCase field names (textContent, content)
        # NOTE: the nested pop() calls always evaluate the inner pop first, so
        # BOTH key variants are removed from result_json before it is saved
        # to article.json (the camelCase value wins when both are present).
        text_content = result_json.pop('textContent', result_json.pop('text-content', ''))
        html_content = result_json.pop('content', result_json.pop('html-content', ''))
        if not text_content and not html_content:
            return False, None, 'No content extracted'
        (output_dir / OUTPUT_FILE).write_text(html_content, encoding='utf-8')
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
        # Remaining metadata (title, byline, etc.) goes to article.json.
        (output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
        # Link images/ to responses capture (if available) — best-effort:
        # any failure here is swallowed and does not affect the result.
        try:
            hostname = urlparse(url).hostname or ''
            if hostname:
                responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve()
                link_path = output_dir / 'images'
                if responses_images.exists() and responses_images.is_dir():
                    # Remove a stale symlink/file at the link location; a real
                    # directory is left alone (symlink_to below would then
                    # raise and be swallowed by the outer except).
                    if link_path.exists() or link_path.is_symlink():
                        if link_path.is_symlink() or link_path.is_file():
                            link_path.unlink()
                else:
                    responses_images = None
                if responses_images:
                    # Use a relative symlink so the snapshot dir stays movable.
                    rel_target = os.path.relpath(str(responses_images), str(output_dir))
                    link_path.symlink_to(rel_target)
        except Exception:
            pass
        return True, OUTPUT_FILE, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        # Any other failure is reported as a transient error string.
        return False, None, f'{type(e).__name__}: {e}'
  131. @click.command()
  132. @click.option('--url', required=True, help='URL to extract article from')
  133. @click.option('--snapshot-id', required=True, help='Snapshot UUID')
  134. def main(url: str, snapshot_id: str):
  135. """Extract article content using Mozilla's Readability."""
  136. try:
  137. # Get binary from environment
  138. binary = get_env('READABILITY_BINARY', 'readability-extractor')
  139. # Run extraction
  140. success, output, error = extract_readability(url, binary)
  141. if success:
  142. # Success - emit ArchiveResult
  143. result = {
  144. 'type': 'ArchiveResult',
  145. 'status': 'succeeded',
  146. 'output_str': output or ''
  147. }
  148. print(json.dumps(result))
  149. sys.exit(0)
  150. else:
  151. # Transient error - emit NO JSONL
  152. print(f'ERROR: {error}', file=sys.stderr)
  153. sys.exit(1)
  154. except Exception as e:
  155. # Transient error - emit NO JSONL
  156. print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
  157. sys.exit(1)
  158. if __name__ == '__main__':
  159. main()