__init__.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. # Copyright (c) 2013 Michael Dowling <[email protected]>
  2. # Copyright (c) 2017 Jared Dillard <[email protected]>
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a copy
  5. # of this software and associated documentation files (the "Software"), to deal
  6. # in the Software without restriction, including without limitation the rights
  7. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. # copies of the Software, and to permit persons to whom the Software is
  9. # furnished to do so, subject to the following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included in
  12. # all copies or substantial portions of the Software.
  13. import fnmatch
  14. import os
  15. import queue
  16. from datetime import datetime, timezone
  17. from multiprocessing import Manager
  18. from pathlib import Path
  19. from typing import Any, Dict, List, Optional
  20. from xml.etree import ElementTree
  21. from sphinx.application import Sphinx
  22. from sphinx.errors import ExtensionError
  23. from sphinx.util.logging import getLogger
# Version of this extension; setup() reports it to Sphinx under the "version" key.
__version__ = "2.9.0"

# Module-level Sphinx logger used for the warnings and info messages emitted
# by the functions in this file.
logger = getLogger(__name__)
  26. def setup(app: Sphinx) -> Dict[str, Any]:
  27. """
  28. Sphinx extension setup function.
  29. It adds config values and connects Sphinx events to the sitemap builder.
  30. :param app: The Sphinx Application instance
  31. :return: A dict of Sphinx extension options
  32. """
  33. app.add_config_value("site_url", default=None, rebuild="")
  34. app.add_config_value(
  35. "sitemap_url_scheme", default="{lang}{version}{link}", rebuild=""
  36. )
  37. app.add_config_value("sitemap_locales", default=[], rebuild="")
  38. app.add_config_value("sitemap_filename", default="sitemap.xml", rebuild="")
  39. app.add_config_value("sitemap_excludes", default=[], rebuild="")
  40. app.add_config_value("sitemap_show_lastmod", default=False, rebuild="")
  41. app.add_config_value("sitemap_indent", default=0, rebuild="")
  42. try:
  43. app.add_config_value("html_baseurl", default=None, rebuild="")
  44. except BaseException:
  45. pass
  46. # install sphinx_last_updated_by_git extension if it exists
  47. if app.config.sitemap_show_lastmod:
  48. try:
  49. app.setup_extension("sphinx_last_updated_by_git")
  50. except ExtensionError as e:
  51. logger.warning(
  52. f"{e}",
  53. type="sitemap",
  54. subtype="configuration",
  55. )
  56. app.config.sitemap_show_lastmod = False
  57. app.connect("builder-inited", record_builder_type)
  58. app.connect("html-page-context", add_html_link)
  59. app.connect("build-finished", create_sitemap)
  60. return {
  61. "parallel_read_safe": True,
  62. "parallel_write_safe": True,
  63. "version": __version__,
  64. }
  65. def get_locales(app: Sphinx) -> List[str]:
  66. """
  67. Get a list of locales from the extension config or automatically detect based
  68. on Sphinx Application config.
  69. :param app: The Sphinx Application instance
  70. :return: A list of locales
  71. """
  72. # Manually configured list of locales
  73. sitemap_locales: Optional[List[str]] = app.builder.config.sitemap_locales
  74. if sitemap_locales:
  75. # special value to add nothing -> use primary language only
  76. if sitemap_locales == [None]:
  77. return []
  78. # otherwise, add each locale
  79. return [locale for locale in sitemap_locales]
  80. # Or autodetect locales
  81. locales = []
  82. for locale_dir in app.builder.config.locale_dirs:
  83. locale_dir = os.path.join(app.confdir, locale_dir)
  84. if os.path.isdir(locale_dir):
  85. for locale in os.listdir(locale_dir):
  86. if os.path.isdir(os.path.join(locale_dir, locale)):
  87. locales.append(locale)
  88. return locales
  89. def record_builder_type(app: Sphinx):
  90. """
  91. Determine if the Sphinx Builder is an instance of DirectoryHTMLBuilder and store that in the
  92. application environment.
  93. :param app: The Sphinx Application instance
  94. """
  95. # builder isn't initialized in the setup so we do it here
  96. builder = getattr(app, "builder", None)
  97. if builder is None:
  98. return
  99. builder.env.is_directory_builder = type(builder).__name__ == "DirectoryHTMLBuilder"
  100. builder.env.app.sitemap_links = Manager().Queue()
  101. def is_excluded(sitemap_link: str, exclude_patterns: List[str]) -> bool:
  102. """
  103. Check if a sitemap link should be excluded based on wildcard patterns.
  104. :param sitemap_link: The sitemap link to check
  105. :param exclude_patterns: List of wildcard patterns to match against
  106. :return: True if the link matches any exclude pattern, False otherwise
  107. """
  108. return any(fnmatch.fnmatch(sitemap_link, pattern) for pattern in exclude_patterns)
  109. def hreflang_formatter(lang: str) -> str:
  110. """
  111. Format the supplied locale code into a string that is compatible with `hreflang`.
  112. See also:
  113. - https://en.wikipedia.org/wiki/Hreflang#Common_Mistakes
  114. - https://github.com/readthedocs/readthedocs.org/pull/5638
  115. :param lang: The locale string to format
  116. :return: The formatted locale string
  117. """
  118. if "_" in lang:
  119. return lang.replace("_", "-")
  120. return lang
  121. def add_html_link(app: Sphinx, pagename: str, templatename, context, doctree):
  122. """
  123. As each page is built, collect page names for the sitemap
  124. :param app: The Sphinx Application instance
  125. :param pagename: The current page being built
  126. """
  127. env = app.builder.env
  128. if app.builder.config.html_file_suffix is None:
  129. file_suffix = ".html"
  130. else:
  131. file_suffix = app.builder.config.html_file_suffix
  132. last_updated = None
  133. if app.builder.config.sitemap_show_lastmod and pagename in env.git_last_updated:
  134. timestamp, show_sourcelink = env.git_last_updated[pagename]
  135. # TODO verify dates
  136. # TODO handle untracked pages (add option to use current timestamp?)
  137. if timestamp:
  138. utc_date = datetime.fromtimestamp(int(timestamp), timezone.utc)
  139. last_updated = utc_date.strftime("%Y-%m-%dT%H:%M:%SZ")
  140. # Support DirectoryHTMLBuilder path structure
  141. # where generated links between pages omit the index.html
  142. if env.is_directory_builder: # type: ignore
  143. if pagename == "index":
  144. sitemap_link = ""
  145. elif pagename.endswith("/index"):
  146. sitemap_link = pagename[:-6] + "/"
  147. else:
  148. sitemap_link = pagename + "/"
  149. else:
  150. sitemap_link = pagename + file_suffix
  151. if not is_excluded(sitemap_link, app.builder.config.sitemap_excludes):
  152. env.app.sitemap_links.put((sitemap_link, last_updated)) # type: ignore
  153. def create_sitemap(app: Sphinx, exception):
  154. """
  155. Generates the sitemap.xml from the collected HTML page links.
  156. :param app: The Sphinx Application instance
  157. """
  158. site_url = app.builder.config.site_url or app.builder.config.html_baseurl
  159. if site_url:
  160. site_url.rstrip("/") + "/"
  161. else:
  162. logger.warning(
  163. "sphinx-sitemap: html_baseurl is required in conf.py." "Sitemap not built.",
  164. type="sitemap",
  165. subtype="configuration",
  166. )
  167. return
  168. if app.env.app.sitemap_links.empty(): # type: ignore
  169. logger.info(
  170. "sphinx-sitemap: No pages generated for %s" % app.config.sitemap_filename,
  171. type="sitemap",
  172. subtype="information",
  173. )
  174. return
  175. ElementTree.register_namespace("xhtml", "http://www.w3.org/1999/xhtml")
  176. root = ElementTree.Element(
  177. "urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
  178. )
  179. locales = get_locales(app)
  180. if app.builder.config.version:
  181. version = app.builder.config.version + "/"
  182. else:
  183. version = ""
  184. while True:
  185. try:
  186. link, last_updated = app.env.app.sitemap_links.get_nowait() # type: ignore
  187. except queue.Empty:
  188. break
  189. url = ElementTree.SubElement(root, "url")
  190. scheme = app.config.sitemap_url_scheme
  191. if app.builder.config.language:
  192. lang = app.builder.config.language + "/"
  193. else:
  194. lang = ""
  195. # add page url
  196. ElementTree.SubElement(url, "loc").text = site_url + scheme.format(
  197. lang=lang, version=version, link=link
  198. )
  199. # add page lastmode date if it exists
  200. if last_updated:
  201. ElementTree.SubElement(url, "lastmod").text = last_updated
  202. # add alternate language page urls
  203. for lang in locales:
  204. lang = lang + "/"
  205. ElementTree.SubElement(
  206. url,
  207. "{http://www.w3.org/1999/xhtml}link",
  208. rel="alternate",
  209. hreflang=hreflang_formatter(lang.rstrip("/")),
  210. href=site_url + scheme.format(lang=lang, version=version, link=link),
  211. )
  212. filename = Path(app.outdir) / app.config.sitemap_filename
  213. if isinstance(app.config.sitemap_indent, int) and app.config.sitemap_indent > 0:
  214. ElementTree.indent(root, space=app.config.sitemap_indent * " ")
  215. ElementTree.ElementTree(root).write(
  216. filename, xml_declaration=True, encoding="utf-8", method="xml"
  217. )
  218. logger.info(
  219. "sphinx-sitemap: %s was generated for URL %s in %s"
  220. % (app.config.sitemap_filename, site_url, filename),
  221. type="sitemap",
  222. subtype="information",
  223. )