# archivebox/extractors/title.py
  1. __package__ = 'archivebox.extractors'
  2. import re
  3. from html.parser import HTMLParser
  4. from pathlib import Path
  5. from typing import Optional
  6. from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
  7. from ..util import (
  8. enforce_types,
  9. download_url,
  10. htmldecode,
  11. dedupe,
  12. )
  13. from ..config import (
  14. TIMEOUT,
  15. CHECK_SSL_VALIDITY,
  16. SAVE_TITLE,
  17. CURL_BINARY,
  18. CURL_ARGS,
  19. CURL_EXTRA_ARGS,
  20. CURL_VERSION,
  21. CURL_USER_AGENT,
  22. )
  23. from ..logging_util import TimedProgress
# Loose fallback pattern for pulling <title> text out of raw HTML when the
# stricter HTMLParser-based TitleParser fails on broken/malformed markup.
HTML_TITLE_REGEX = re.compile(
    r'<title.*?>'  # start matching text after <title> tag
    r'([^<>]+)',   # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
  29. class TitleParser(HTMLParser):
  30. def __init__(self, *args, **kwargs):
  31. super().__init__(*args, **kwargs)
  32. self.title_tag = ""
  33. self.title_og = ""
  34. self.inside_title_tag = False
  35. @property
  36. def title(self):
  37. return self.title_tag or self.title_og or None
  38. def handle_starttag(self, tag, attrs):
  39. if tag.lower() == "title" and not self.title_tag:
  40. self.inside_title_tag = True
  41. elif tag.lower() == "meta" and not self.title_og:
  42. attrs = dict(attrs)
  43. if attrs.get("property") == "og:title" and attrs.get("content"):
  44. self.title_og = attrs.get("content")
  45. def handle_data(self, data):
  46. if self.inside_title_tag and data:
  47. self.title_tag += data.strip()
  48. def handle_endtag(self, tag):
  49. if tag.lower() == "title":
  50. self.inside_title_tag = False
  51. @enforce_types
  52. def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
  53. """
  54. Try to find wget, singlefile and then dom files.
  55. If none is found, download the url again.
  56. """
  57. canonical = link.canonical_outputs()
  58. abs_path = path.absolute()
  59. # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
  60. sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
  61. document = None
  62. for source in sources:
  63. try:
  64. with open(abs_path / source, "r", encoding="utf-8") as f:
  65. document = f.read()
  66. break
  67. except (FileNotFoundError, TypeError, UnicodeDecodeError):
  68. continue
  69. if document is None:
  70. return download_url(link.url, timeout=timeout)
  71. else:
  72. return document
  73. def get_output_path():
  74. # TODO: actually save title to this file
  75. # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
  76. return 'title.json'
  77. @enforce_types
  78. def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
  79. # if link already has valid title, skip it
  80. if not overwrite and link.title and not link.title.lower().startswith('http'):
  81. return False
  82. return SAVE_TITLE
  83. def extract_title_with_regex(html):
  84. match = re.search(HTML_TITLE_REGEX, html)
  85. output = htmldecode(match.group(1).strip()) if match else None
  86. return output
  87. @enforce_types
  88. def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
  89. """try to guess the page's title from its content"""
  90. from core.models import Snapshot
  91. output: ArchiveOutput = None
  92. # later options take precedence
  93. options = [
  94. *CURL_ARGS,
  95. *CURL_EXTRA_ARGS,
  96. '--max-time', str(timeout),
  97. *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
  98. *([] if CHECK_SSL_VALIDITY else ['--insecure']),
  99. ]
  100. cmd = [
  101. CURL_BINARY,
  102. *dedupe(options),
  103. link.url,
  104. ]
  105. status = 'succeeded'
  106. timer = TimedProgress(timeout, prefix=' ')
  107. try:
  108. html = get_html(link, out_dir, timeout=timeout)
  109. try:
  110. # try using relatively strict html parser first
  111. parser = TitleParser()
  112. parser.feed(html)
  113. output = parser.title
  114. if output is None:
  115. raise
  116. except Exception:
  117. # fallback to regex that can handle broken/malformed html
  118. output = extract_title_with_regex(html)
  119. # if title is better than the one in the db, update db with new title
  120. if isinstance(output, str) and output:
  121. if not link.title or len(output) >= len(link.title):
  122. Snapshot.objects.filter(url=link.url,
  123. timestamp=link.timestamp)\
  124. .update(title=output)
  125. else:
  126. # if no content was returned, dont save a title (because it might be a temporary error)
  127. if not html:
  128. raise ArchiveError('Unable to detect page title')
  129. # output = html[:128] # use first bit of content as the title
  130. output = link.base_url # use the filename as the title (better UX)
  131. except Exception as err:
  132. status = 'failed'
  133. output = err
  134. finally:
  135. timer.end()
  136. return ArchiveResult(
  137. cmd=cmd,
  138. pwd=str(out_dir),
  139. cmd_version=CURL_VERSION,
  140. output=output,
  141. status=status,
  142. **timer.stats,
  143. )