title.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. __package__ = 'archivebox.extractors'
  2. import re
  3. from html.parser import HTMLParser
  4. from pathlib import Path
  5. from typing import Optional
  6. from django.db.models import Model
  7. from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
  8. from ..util import (
  9. enforce_types,
  10. download_url,
  11. htmldecode,
  12. )
  13. from ..config import (
  14. TIMEOUT,
  15. CHECK_SSL_VALIDITY,
  16. SAVE_TITLE,
  17. CURL_BINARY,
  18. CURL_ARGS,
  19. CURL_VERSION,
  20. CURL_USER_AGENT,
  21. )
  22. from ..logging_util import TimedProgress
  # Fallback pattern used to pull a title out of HTML without a real parser.
  # Group 1 is the raw title text; callers are expected to strip/decode it.
  #
  # The tag part uses `\b[^>]*` rather than `.*?` so that:
  #   - tags that merely start with "title" (e.g. <titlebar>) are not matched
  #   - the match can never run past the end of the opening tag
  # The capture is `[^<>]+` (one or more non-bracket characters) so that
  # one-character titles match and an empty <title></title> yields no match
  # instead of capturing the closing tag.
  HTML_TITLE_REGEX = re.compile(
      r'<title\b[^>]*>'   # opening <title> tag, with optional attributes
      r'([^<>]+)',        # title text: everything up to the next tag
      re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
  )
  class TitleParser(HTMLParser):
      """Collect a page title while parsing HTML.

      Text inside the first non-empty <title> element is gathered into
      ``title_tag``; the ``content`` of the first
      <meta property="og:title"> is kept in ``title_og``.  Read the result
      from the ``title`` property after calling ``feed()``.
      """

      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
          self.title_tag = ""             # stripped text collected from <title>
          self.title_og = ""              # og:title meta content, if any
          self.inside_title_tag = False   # True between <title> and </title>
          self._title_chunks = []         # raw (unstripped) <title> text pieces

      @property
      def title(self):
          """Return the <title> text, else the og:title value, else None."""
          return self.title_tag or self.title_og or None

      def handle_starttag(self, tag, attrs):
          """Track entry into <title> and pick up an og:title meta tag."""
          tag = tag.lower()
          if tag == "title" and not self.title_tag:
              # only collect while no title text has been captured yet
              self.inside_title_tag = True
          elif tag == "meta" and not self.title_og:
              attrs = dict(attrs)
              if attrs.get("property") == "og:title" and attrs.get("content"):
                  self.title_og = attrs.get("content")

      def handle_data(self, data):
          """Accumulate text that appears while inside <title>."""
          if self.inside_title_tag and data:
              # The parser may deliver one title in several pieces.  Keep the
              # raw pieces and strip only the joined result, so whitespace
              # between pieces is preserved ("Hello " + "World" must not
              # become "HelloWorld").
              self._title_chunks.append(data)
              self.title_tag = "".join(self._title_chunks).strip()

      def handle_endtag(self, tag):
          """Stop collecting title text at </title>."""
          if tag.lower() == "title":
              self.inside_title_tag = False
  50. # output = '{title}'
  @enforce_types
  def should_save_title(snapshot: Model, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
      """Decide whether the title extractor should run for this snapshot.

      Returns False when ``overwrite`` is falsy and the snapshot already has a
      title that does not start with 'http' (case-insensitive); otherwise
      returns the SAVE_TITLE config value.  ``out_dir`` is not used here.
      """
      if not overwrite:
          current_title = snapshot.title
          # a title that is just a URL does not count as a real title
          if current_title and not current_title.lower().startswith('http'):
              return False

      return SAVE_TITLE
  def extract_title_with_regex(html):
      """Find a title in ``html`` using HTML_TITLE_REGEX.

      Returns the first capture group, stripped and passed through
      htmldecode(), or None when the pattern does not match.
      """
      found = HTML_TITLE_REGEX.search(html)
      if not found:
          return None

      raw_title = found.group(1).strip()
      return htmldecode(raw_title)
  @enforce_types
  def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
      """try to guess the page's title from its content

      Downloads ``snapshot.url``, extracts a title with TitleParser (falling
      back to extract_title_with_regex), and when the new title is at least as
      long as the current one, updates the matching Snapshot rows and
      ``snapshot.title``.

      Returns an ArchiveResult.  On success ``output`` is the detected title
      (or a URL-derived fallback when the page has content but no title); on
      any exception ``status`` is 'failed' and ``output`` is the exception.
      """
      from core.models import Snapshot

      output: ArchiveOutput = None
      # This command line is only recorded in the result; the page itself is
      # fetched with download_url() below.
      cmd = [
          CURL_BINARY,
          *CURL_ARGS,
          '--max-time', str(timeout),
          *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
          *([] if CHECK_SSL_VALIDITY else ['--insecure']),
          snapshot.url,
      ]
      status = 'succeeded'
      timer = TimedProgress(timeout, prefix=' ')
      try:
          html = download_url(snapshot.url, timeout=timeout)

          try:
              # try using relatively strict html parser first
              parser = TitleParser()
              parser.feed(html)
              output = parser.title
          except Exception:
              # deliberate best-effort: any parser failure just means we
              # move on to the regex fallback below
              output = None

          if output is None:
              # fallback to regex that can handle broken/malformed html
              output = extract_title_with_regex(html)

          # if title is better than the one in the db, update db with new title
          if isinstance(output, str) and output:
              if not snapshot.title or len(output) >= len(snapshot.title):
                  Snapshot.objects.filter(
                      url=snapshot.url,
                      timestamp=snapshot.timestamp,
                  ).update(title=output)
                  snapshot.title = output
          else:
              # if no content was returned, dont save a title (because it might be a temporary error)
              if not html:
                  raise ArchiveError('Unable to detect page title')
              # Use a URL-derived name as the title (better UX than raw html).
              # NOTE(review): this previously read `link.base_url`, but no
              # `link` exists in this function, so it always raised NameError.
              # Presumably the snapshot exposes `base_url` -- confirm against
              # the Snapshot model; the plain url is used if it does not.
              output = getattr(snapshot, 'base_url', None) or snapshot.url
      except Exception as err:
          status = 'failed'
          output = err
      finally:
          timer.end()

      return ArchiveResult(
          cmd=cmd,
          pwd=str(out_dir),
          cmd_version=CURL_VERSION,
          output=output,
          status=status,
          **timer.stats,
      )