sphinx_last_updated_by_git.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. """Get the "last updated" time for each Sphinx page from Git."""
  2. from collections import defaultdict
  3. from contextlib import suppress
  4. from datetime import datetime, timezone
  5. from pathlib import Path
  6. import subprocess
  7. from sphinx.locale import _
  8. from sphinx.util.i18n import format_date
  9. from sphinx.util.logging import getLogger
  10. from sphinx.util.matching import Matcher
  11. try:
  12. from sphinx.util.display import status_iterator
  13. except ImportError:
  14. # For older Sphinx versions, will be removed in Sphinx 8:
  15. from sphinx.util import status_iterator
# Version of this extension; setup() reports it back to Sphinx.
__version__ = '0.3.8'
# Module-level logger, obtained through Sphinx's logging helper.
logger = getLogger(__name__)
  18. def update_file_dates(git_dir, exclude_commits, file_dates):
  19. """Ask Git for "author date" of given files in given directory.
  20. A git subprocess is executed at most three times:
  21. * First, to check which of the files are even managed by Git.
  22. * With only those files (if any), a "git log" is created and parsed
  23. until all requested files have been found.
  24. * If the root commit is reached (i.e. there is at least one of the
  25. requested files that has never been edited since the root commit),
  26. git is called again to check whether the repo is "shallow".
  27. """
  28. requested_files = set(file_dates)
  29. assert requested_files
  30. existing_files = subprocess.check_output(
  31. [
  32. 'git', 'ls-tree', '--name-only', '-z', 'HEAD',
  33. '--', *requested_files
  34. ],
  35. cwd=git_dir,
  36. stderr=subprocess.PIPE,
  37. ).rstrip().rstrip(b'\0')
  38. if not existing_files:
  39. return # None of the requested files are under version control
  40. existing_files = existing_files.decode('utf-8').split('\0')
  41. requested_files.intersection_update(existing_files)
  42. assert requested_files
  43. process = subprocess.Popen(
  44. [
  45. 'git', 'log', '--pretty=format:%n%at%x00%H%x00%P',
  46. '--author-date-order', '--relative', '--name-only',
  47. '--no-show-signature', '-z', '-m', '--', *requested_files
  48. ],
  49. cwd=git_dir,
  50. stdout=subprocess.PIPE,
  51. # NB: We ignore stderr to avoid deadlocks when reading stdout
  52. )
  53. with process:
  54. parse_log(process.stdout, requested_files,
  55. git_dir, exclude_commits, file_dates)
  56. # We don't need the rest of the log if there's something left:
  57. process.terminate()
  58. def parse_log(stream, requested_files, git_dir, exclude_commits, file_dates):
  59. requested_files = set(f.encode('utf-8') for f in requested_files)
  60. line0 = stream.readline()
  61. # First line is blank
  62. assert not line0.rstrip(), 'unexpected git output in {}: {}'.format(
  63. git_dir, line0)
  64. while requested_files:
  65. line1 = stream.readline()
  66. if not line1:
  67. msg = 'end of git log in {}, unhandled files: {}'
  68. assert exclude_commits, msg.format(
  69. git_dir, requested_files)
  70. msg = 'unhandled files in {}: {}, due to excluded commits: {}'
  71. logger.warning(
  72. msg.format(git_dir, requested_files, exclude_commits),
  73. type='git', subtype='unhandled_files')
  74. break
  75. pieces = line1.rstrip().split(b'\0')
  76. assert len(pieces) == 3, 'invalid git info in {}: {}'.format(
  77. git_dir, line1)
  78. timestamp, commit, parent_commits = pieces
  79. line2 = stream.readline().rstrip()
  80. assert line2.endswith(b'\0'), 'unexpected file list in {}: {}'.format(
  81. git_dir, line2)
  82. line2 = line2.rstrip(b'\0')
  83. assert line2, 'no changed files in {} (parent commit(s): {})'.format(
  84. git_dir, parent_commits)
  85. changed_files = line2.split(b'\0')
  86. if commit in exclude_commits:
  87. continue
  88. too_shallow = False
  89. if not parent_commits:
  90. is_shallow = subprocess.check_output(
  91. # --is-shallow-repository is available since Git 2.15.
  92. ['git', 'rev-parse', '--is-shallow-repository'],
  93. cwd=git_dir,
  94. stderr=subprocess.PIPE,
  95. ).rstrip()
  96. if is_shallow == b'true':
  97. too_shallow = True
  98. for file in changed_files:
  99. try:
  100. requested_files.remove(file)
  101. except KeyError:
  102. continue
  103. else:
  104. file_dates[file.decode('utf-8')] = timestamp, too_shallow
  105. def _env_updated(app, env):
  106. # NB: We call git once per sub-directory, because each one could
  107. # potentially be a separate Git repo (or at least a submodule)!
  108. def to_relpath(f: Path) -> str:
  109. with suppress(ValueError):
  110. f = f.relative_to(app.srcdir)
  111. return str(f)
  112. src_paths = {}
  113. src_dates = defaultdict(dict)
  114. excluded = Matcher(app.config.git_exclude_patterns)
  115. exclude_commits = set(
  116. map(lambda h: h.encode('utf-8'), app.config.git_exclude_commits))
  117. for docname, data in env.git_last_updated.items():
  118. if data is not None:
  119. continue # No need to update this source file
  120. if excluded(env.doc2path(docname, False)):
  121. continue
  122. srcfile = Path(env.doc2path(docname)).resolve()
  123. src_dates[srcfile.parent][srcfile.name] = None
  124. src_paths[docname] = srcfile.parent, srcfile.name
  125. srcdir_iter = status_iterator(
  126. src_dates, 'getting Git timestamps for source files... ',
  127. 'fuchsia', len(src_dates), app.verbosity, stringify_func=to_relpath)
  128. for git_dir in srcdir_iter:
  129. try:
  130. update_file_dates(git_dir, exclude_commits, src_dates[git_dir])
  131. except subprocess.CalledProcessError as e:
  132. msg = 'Error getting data from Git'
  133. msg += ' (no "last updated" dates will be shown'
  134. msg += ' for source files from {})'.format(git_dir)
  135. if e.stderr:
  136. msg += ':\n' + e.stderr.decode('utf-8')
  137. logger.warning(msg, type='git', subtype='subprocess_error')
  138. except FileNotFoundError as e:
  139. logger.warning(
  140. '"git" command not found, '
  141. 'no "last updated" dates will be shown',
  142. type='git', subtype='command_not_found')
  143. return
  144. dep_paths = defaultdict(list)
  145. dep_dates = defaultdict(dict)
  146. candi_dates = defaultdict(list)
  147. show_sourcelink = {}
  148. for docname, (src_dir, filename) in src_paths.items():
  149. show_sourcelink[docname] = True
  150. date = src_dates[src_dir][filename]
  151. if date is None:
  152. if not app.config.git_untracked_show_sourcelink:
  153. show_sourcelink[docname] = False
  154. if not app.config.git_untracked_check_dependencies:
  155. continue
  156. else:
  157. candi_dates[docname].append(date)
  158. for dep in env.dependencies[docname]:
  159. # NB: dependencies are relative to srcdir and may contain ".."!
  160. if excluded(dep):
  161. continue
  162. depfile = Path(env.srcdir, dep).resolve()
  163. if not depfile.exists():
  164. logger.warning(
  165. "Dependency file %r, doesn't exist, skipping",
  166. depfile,
  167. location=docname,
  168. type='git',
  169. subtype='dependency_not_found',
  170. )
  171. continue
  172. dep_dates[depfile.parent][depfile.name] = None
  173. dep_paths[docname].append((depfile.parent, depfile.name))
  174. depdir_iter = status_iterator(
  175. dep_dates, 'getting Git timestamps for dependencies... ',
  176. 'turquoise', len(dep_dates), app.verbosity, stringify_func=to_relpath)
  177. for git_dir in depdir_iter:
  178. try:
  179. update_file_dates(git_dir, exclude_commits, dep_dates[git_dir])
  180. except subprocess.CalledProcessError as e:
  181. pass # We ignore errors in dependencies
  182. for docname, deps in dep_paths.items():
  183. for dep_dir, filename in deps:
  184. date = dep_dates[dep_dir][filename]
  185. if date is None:
  186. continue
  187. candi_dates[docname].append(date)
  188. for docname in src_paths:
  189. timestamps = candi_dates[docname]
  190. if timestamps:
  191. # NB: too_shallow is only relevant if it affects the latest date.
  192. timestamp, too_shallow = max(timestamps)
  193. if too_shallow:
  194. timestamp = None
  195. logger.warning(
  196. 'Git clone too shallow', location=docname,
  197. type='git', subtype='too_shallow')
  198. else:
  199. timestamp = None
  200. env.git_last_updated[docname] = timestamp, show_sourcelink[docname]
  201. def _html_page_context(app, pagename, templatename, context, doctree):
  202. context['last_updated'] = None
  203. lufmt = app.config.html_last_updated_fmt
  204. if lufmt is None or 'sourcename' not in context:
  205. return
  206. if 'page_source_suffix' not in context:
  207. # This happens in 'singlehtml' builders
  208. assert context['sourcename'] == ''
  209. return
  210. data = app.env.git_last_updated[pagename]
  211. if data is None:
  212. # There was a problem with git, a warning has already been issued
  213. timestamp = None
  214. show_sourcelink = False
  215. else:
  216. timestamp, show_sourcelink = data
  217. if not show_sourcelink:
  218. del context['sourcename']
  219. del context['page_source_suffix']
  220. if timestamp is None:
  221. return
  222. utc_date = datetime.fromtimestamp(int(timestamp), timezone.utc)
  223. date = utc_date.astimezone(app.config.git_last_updated_timezone)
  224. context['last_updated'] = format_date(
  225. lufmt or _('%b %d, %Y'),
  226. date=date,
  227. language=app.config.language)
  228. if app.config.git_last_updated_metatags:
  229. context['metatags'] += """
  230. <meta property="article:modified_time" content="{}" />""".format(
  231. date.isoformat())
  232. def _config_inited(app, config):
  233. if config.html_last_updated_fmt is None:
  234. config.html_last_updated_fmt = ''
  235. if isinstance(config.git_last_updated_timezone, str):
  236. from babel.dates import get_timezone
  237. config.git_last_updated_timezone = get_timezone(
  238. config.git_last_updated_timezone)
  239. def _builder_inited(app):
  240. env = app.env
  241. if not hasattr(env, 'git_last_updated'):
  242. env.git_last_updated = {}
  243. def _source_read(app, docname, source):
  244. env = app.env
  245. if docname not in env.found_docs:
  246. # Since Sphinx 7.2, "docname" can be None or a relative path
  247. # to a file included with the "include" directive.
  248. # We are only interested in actual source documents.
  249. return
  250. if docname in env.git_last_updated:
  251. # Again since Sphinx 7.2, the source-read hook can be called
  252. # multiple times when using the "include" directive.
  253. return
  254. env.git_last_updated[docname] = None
  255. def _env_merge_info(app, env, docnames, other):
  256. env.git_last_updated.update(other.git_last_updated)
  257. def _env_purge_doc(app, env, docname):
  258. try:
  259. del env.git_last_updated[docname]
  260. except KeyError:
  261. pass
  262. def setup(app):
  263. """Sphinx extension entry point."""
  264. app.require_sphinx('1.8') # For "config-inited" event
  265. app.connect('html-page-context', _html_page_context)
  266. app.connect('config-inited', _config_inited)
  267. app.connect('env-updated', _env_updated)
  268. app.connect('builder-inited', _builder_inited)
  269. app.connect('source-read', _source_read)
  270. app.connect('env-merge-info', _env_merge_info)
  271. app.connect('env-purge-doc', _env_purge_doc)
  272. app.add_config_value(
  273. 'git_untracked_check_dependencies', True, rebuild='env')
  274. app.add_config_value(
  275. 'git_untracked_show_sourcelink', False, rebuild='env')
  276. app.add_config_value(
  277. 'git_last_updated_timezone', None, rebuild='env')
  278. app.add_config_value(
  279. 'git_last_updated_metatags', True, rebuild='html')
  280. app.add_config_value('git_exclude_patterns', [], rebuild='env')
  281. app.add_config_value(
  282. 'git_exclude_commits', [], rebuild='env')
  283. return {
  284. 'version': __version__,
  285. 'parallel_read_safe': True,
  286. 'env_version': 1,
  287. }