title.py

__package__ = 'archivebox.extractors'

import re

from typing import Optional

from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
    enforce_types,
    is_static_file,
    download_url,
    htmldecode,
)
from ..config import (
    TIMEOUT,
    CHECK_SSL_VALIDITY,
    SAVE_TITLE,
    CURL_BINARY,
    CURL_VERSION,
    CURL_USER_AGENT,
)
from ..cli.logging import TimedProgress


HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'    # start matching text after <title> tag
    r'(.[^<>]+)',    # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)


@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
    if link.title and not link.title.lower().startswith('http'):
        return False

    if is_static_file(link.url):
        return False

    return SAVE_TITLE


@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    output: ArchiveOutput = None
    # recorded in the ArchiveResult for logging only; the actual fetch below uses download_url()
    cmd = [
        CURL_BINARY,
        '--silent',
        '--max-time', str(timeout),
        '--location',
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
        link.url,
        '|',
        'grep',
        '<title',
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        html = download_url(link.url, timeout=timeout)
        match = re.search(HTML_TITLE_REGEX, html)
        output = htmldecode(match.group(1).strip()) if match else None
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
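

# --------------------------------------------------------------------------
# Illustrative sketch, not part of title.py: shows how the title regex above
# behaves on raw HTML. The sample markup and the use of the stdlib
# html.unescape (standing in for ..util.htmldecode) are assumptions made so
# the snippet is self-contained and runnable outside the package.

import re
from html import unescape

TITLE_REGEX = re.compile(
    r'<title.*?>'    # skip past the opening <title ...> tag
    r'(.[^<>]+)',    # capture the title text up to the next < or >
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)

sample_html = '<html><head><title class="site">Example &amp; Friends</title></head></html>'
match = re.search(TITLE_REGEX, sample_html)
title = unescape(match.group(1).strip()) if match else None
print(title)  # -> Example & Friends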