# archivebox/extractors/title.py
  1. __package__ = 'archivebox.extractors'
  2. import re
  3. from html.parser import HTMLParser
  4. from pathlib import Path
  5. from typing import Optional
  6. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  7. from ..util import (
  8. enforce_types,
  9. download_url,
  10. htmldecode,
  11. )
  12. from ..config import (
  13. TIMEOUT,
  14. CHECK_SSL_VALIDITY,
  15. SAVE_TITLE,
  16. CURL_BINARY,
  17. CURL_ARGS,
  18. CURL_VERSION,
  19. CURL_USER_AGENT,
  20. )
  21. from ..logging_util import TimedProgress
  22. HTML_TITLE_REGEX = re.compile(
  23. r'<title.*?>' # start matching text after <title> tag
  24. r'([^<>]+)', # get everything up to these symbols
  25. re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
  26. )
  27. class TitleParser(HTMLParser):
  28. def __init__(self, *args, **kwargs):
  29. super().__init__(*args, **kwargs)
  30. self.title_tag = ""
  31. self.title_og = ""
  32. self.inside_title_tag = False
  33. @property
  34. def title(self):
  35. return self.title_tag or self.title_og or None
  36. def handle_starttag(self, tag, attrs):
  37. if tag.lower() == "title" and not self.title_tag:
  38. self.inside_title_tag = True
  39. elif tag.lower() == "meta" and not self.title_og:
  40. attrs = dict(attrs)
  41. if attrs.get("property") == "og:title" and attrs.get("content"):
  42. self.title_og = attrs.get("content")
  43. def handle_data(self, data):
  44. if self.inside_title_tag and data:
  45. self.title_tag += data.strip()
  46. def handle_endtag(self, tag):
  47. if tag.lower() == "title":
  48. self.inside_title_tag = False
  49. @enforce_types
  50. def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
  51. """
  52. Try to find wget, singlefile and then dom files.
  53. If none is found, download the url again.
  54. """
  55. canonical = link.canonical_outputs()
  56. abs_path = path.absolute()
  57. # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
  58. sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
  59. document = None
  60. for source in sources:
  61. try:
  62. with open(abs_path / source, "r", encoding="utf-8") as f:
  63. document = f.read()
  64. break
  65. except (FileNotFoundError, TypeError):
  66. continue
  67. if document is None:
  68. return download_url(link.url, timeout=timeout)
  69. else:
  70. return document
  71. @enforce_types
  72. def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
  73. # if link already has valid title, skip it
  74. if not overwrite and link.title and not link.title.lower().startswith('http'):
  75. return False
  76. return SAVE_TITLE
  77. def extract_title_with_regex(html):
  78. match = re.search(HTML_TITLE_REGEX, html)
  79. output = htmldecode(match.group(1).strip()) if match else None
  80. return output
  81. @enforce_types
  82. def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  83. """try to guess the page's title from its content"""
  84. from core.models import Snapshot
  85. output: ArchiveOutput = None
  86. cmd = [
  87. CURL_BINARY,
  88. *CURL_ARGS,
  89. '--max-time', str(timeout),
  90. *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
  91. *([] if CHECK_SSL_VALIDITY else ['--insecure']),
  92. link.url,
  93. ]
  94. status = 'succeeded'
  95. timer = TimedProgress(timeout, prefix=' ')
  96. try:
  97. html = get_html(link, out_dir, timeout=timeout)
  98. try:
  99. # try using relatively strict html parser first
  100. parser = TitleParser()
  101. parser.feed(html)
  102. output = parser.title
  103. if output is None:
  104. raise
  105. except Exception:
  106. # fallback to regex that can handle broken/malformed html
  107. output = extract_title_with_regex(html)
  108. # if title is better than the one in the db, update db with new title
  109. if isinstance(output, str) and output:
  110. if not link.title or len(output) >= len(link.title):
  111. Snapshot.objects.filter(url=link.url,
  112. timestamp=link.timestamp)\
  113. .update(title=output)
  114. else:
  115. # if no content was returned, dont save a title (because it might be a temporary error)
  116. if not html:
  117. raise ArchiveError('Unable to detect page title')
  118. # output = html[:128] # use first bit of content as the title
  119. output = link.base_url # use the filename as the title (better UX)
  120. except Exception as err:
  121. status = 'failed'
  122. output = err
  123. finally:
  124. timer.end()
  125. return ArchiveResult(
  126. cmd=cmd,
  127. pwd=str(out_dir),
  128. cmd_version=CURL_VERSION,
  129. output=output,
  130. status=status,
  131. **timer.stats,
  132. )