# title.py — ArchiveBox title extractor
  1. __package__ = 'archivebox.extractors'
  2. import re
  3. from html.parser import HTMLParser
  4. from pathlib import Path
  5. from typing import Optional
  6. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  7. from ..util import (
  8. enforce_types,
  9. download_url,
  10. htmldecode,
  11. dedupe,
  12. )
  13. from ..config import (
  14. TIMEOUT,
  15. CHECK_SSL_VALIDITY,
  16. SAVE_TITLE,
  17. CURL_BINARY,
  18. CURL_ARGS,
  19. CURL_EXTRA_ARGS,
  20. CURL_VERSION,
  21. CURL_USER_AGENT,
  22. )
  23. from ..logging_util import TimedProgress
# Loose fallback pattern for pulling a <title> out of broken/malformed HTML
# that the strict HTMLParser-based TitleParser cannot handle.
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'                      # start matching text after <title> tag
    r'([^<>]+)',                       # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
  29. class TitleParser(HTMLParser):
  30. def __init__(self, *args, **kwargs):
  31. super().__init__(*args, **kwargs)
  32. self.title_tag = ""
  33. self.title_og = ""
  34. self.inside_title_tag = False
  35. @property
  36. def title(self):
  37. return self.title_tag or self.title_og or None
  38. def handle_starttag(self, tag, attrs):
  39. if tag.lower() == "title" and not self.title_tag:
  40. self.inside_title_tag = True
  41. elif tag.lower() == "meta" and not self.title_og:
  42. attrs = dict(attrs)
  43. if attrs.get("property") == "og:title" and attrs.get("content"):
  44. self.title_og = attrs.get("content")
  45. def handle_data(self, data):
  46. if self.inside_title_tag and data:
  47. self.title_tag += data.strip()
  48. def handle_endtag(self, tag):
  49. if tag.lower() == "title":
  50. self.inside_title_tag = False
  51. @enforce_types
  52. def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
  53. """
  54. Try to find wget, singlefile and then dom files.
  55. If none is found, download the url again.
  56. """
  57. canonical = link.canonical_outputs()
  58. abs_path = path.absolute()
  59. # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
  60. sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
  61. document = None
  62. for source in sources:
  63. try:
  64. with open(abs_path / source, "r", encoding="utf-8") as f:
  65. document = f.read()
  66. break
  67. except (FileNotFoundError, TypeError, UnicodeDecodeError):
  68. continue
  69. if document is None:
  70. return download_url(link.url, timeout=timeout)
  71. else:
  72. return document
  73. @enforce_types
  74. def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
  75. # if link already has valid title, skip it
  76. if not overwrite and link.title and not link.title.lower().startswith('http'):
  77. return False
  78. return SAVE_TITLE
  79. def extract_title_with_regex(html):
  80. match = re.search(HTML_TITLE_REGEX, html)
  81. output = htmldecode(match.group(1).strip()) if match else None
  82. return output
  83. @enforce_types
  84. def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  85. """try to guess the page's title from its content"""
  86. from core.models import Snapshot
  87. output: ArchiveOutput = None
  88. # later options take precedence
  89. options = [
  90. *CURL_ARGS,
  91. *CURL_EXTRA_ARGS,
  92. '--max-time', str(timeout),
  93. *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
  94. *([] if CHECK_SSL_VALIDITY else ['--insecure']),
  95. ]
  96. cmd = [
  97. CURL_BINARY,
  98. *dedupe(options),
  99. link.url,
  100. ]
  101. status = 'succeeded'
  102. timer = TimedProgress(timeout, prefix=' ')
  103. try:
  104. html = get_html(link, out_dir, timeout=timeout)
  105. try:
  106. # try using relatively strict html parser first
  107. parser = TitleParser()
  108. parser.feed(html)
  109. output = parser.title
  110. if output is None:
  111. raise
  112. except Exception:
  113. # fallback to regex that can handle broken/malformed html
  114. output = extract_title_with_regex(html)
  115. # if title is better than the one in the db, update db with new title
  116. if isinstance(output, str) and output:
  117. if not link.title or len(output) >= len(link.title):
  118. Snapshot.objects.filter(url=link.url,
  119. timestamp=link.timestamp)\
  120. .update(title=output)
  121. else:
  122. # if no content was returned, dont save a title (because it might be a temporary error)
  123. if not html:
  124. raise ArchiveError('Unable to detect page title')
  125. # output = html[:128] # use first bit of content as the title
  126. output = link.base_url # use the filename as the title (better UX)
  127. except Exception as err:
  128. status = 'failed'
  129. output = err
  130. finally:
  131. timer.end()
  132. return ArchiveResult(
  133. cmd=cmd,
  134. pwd=str(out_dir),
  135. cmd_version=CURL_VERSION,
  136. output=output,
  137. status=status,
  138. **timer.stats,
  139. )