docs_consistency_checker.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Document Consistency Checker
  5. Uses modules in the scripts/modules directory to implement document consistency checking functionality
  6. """
  7. import os
  8. import sys
  9. import argparse
  10. # Import modules
  11. from modules.main import main
  12. from modules.file_handler import setup_console_encoding
  13. from modules.markdown_handler import compare_markdown_syntax_trees
  14. from modules.file_handler import read_file_content
  15. def run_docs_consistency_check(source_dir=None, target_dir=None, output_file=None, specific_file=None, source_file=None, target_file=None):
  16. """
  17. Run document consistency check between two language directories
  18. Parameters:
  19. source_dir: Source document directory path
  20. target_dir: Target document directory path
  21. output_file: Output Excel file path
  22. specific_file: Specific file path to check (relative to docs directory)
  23. source_file: Specified source version file path
  24. target_file: Specified target version file path
  25. """
  26. # Set console encoding to resolve character display issues
  27. setup_console_encoding()
  28. # If specific source and target file paths are specified, use these paths
  29. if source_file and target_file:
  30. print(f"Checking files: Source version {source_file}, Target version {target_file}")
  31. # Check if files exist
  32. if not os.path.exists(source_file):
  33. print(f"Error: Source file does not exist: {source_file}")
  34. return
  35. if not os.path.exists(target_file):
  36. print(f"Error: Target file does not exist: {target_file}")
  37. return
  38. # Read file contents
  39. source_content = read_file_content(source_file)
  40. target_content = read_file_content(target_file)
  41. if source_content is None:
  42. print(f"Error: Unable to read source file: {source_file}")
  43. return
  44. if target_content is None:
  45. print(f"Error: Unable to read target file: {target_file}")
  46. return
  47. # Check if they are Markdown files
  48. if not (source_file.endswith('.md') and target_file.endswith('.md')):
  49. print(f"Warning: Files are not Markdown files, skipping syntax tree comparison")
  50. return
  51. # Compare Markdown syntax trees
  52. print(f"Comparing Markdown syntax trees of files...")
  53. inconsistencies = compare_markdown_syntax_trees(source_content, target_content, os.path.basename(source_file))
  54. # Output results
  55. if inconsistencies and inconsistencies != "Consistent":
  56. # Check if it's a return value from syntax tree comparison error
  57. if inconsistencies.startswith("Syntax tree comparison error:"):
  58. print(inconsistencies)
  59. else:
  60. # Split the inconsistency information separated by semicolons into a list, but preserve special separator |ERROR_SEPARATOR|
  61. # First check if there is [ERROR_COUNT:1] tag
  62. if "[ERROR_COUNT:1]" in inconsistencies:
  63. # If there is a tag, add the entire error as an element to the list
  64. issues = [inconsistencies]
  65. else:
  66. # If no tag, split normally, supporting both semicolon and newline as separators
  67. # Replace all newlines with semicolon+space, then split uniformly
  68. temp_inconsistencies = inconsistencies.replace("\n", "; ")
  69. issues = temp_inconsistencies.split("; ")
  70. # Calculate actual error count, considering special tag [ERROR_COUNT:1]
  71. error_count = 0
  72. formatted_issues = []
  73. for issue in issues:
  74. # Check if there is an error count tag
  75. if "[ERROR_COUNT:1]" in issue:
  76. error_count += 1
  77. # Remove tag and replace newlines with semicolons, then add to formatted issues list
  78. formatted_issue = issue.replace("[ERROR_COUNT:1]", "").replace("\n", "; ")
  79. formatted_issues.append(formatted_issue)
  80. else:
  81. # If no tag, check if it's a sub-error of a heading node (already counted error)
  82. # Check if the previous formatted issue contains "heading node" and "errors under:"
  83. is_sub_error = False
  84. for prev_issue in formatted_issues:
  85. if "heading node" in prev_issue and "errors under:" in prev_issue:
  86. is_sub_error = True
  87. break
  88. if not is_sub_error:
  89. error_count += 1
  90. # Ensure all newlines in issues are replaced with semicolons
  91. formatted_issue = issue.replace("\n", "; ")
  92. formatted_issues.append(formatted_issue)
  93. else:
  94. # Ensure all newlines in issues are replaced with semicolons
  95. formatted_issue = issue.replace("\n", "; ")
  96. formatted_issues.append(formatted_issue)
  97. print(f"Found {error_count} inconsistency issues:")
  98. for i, issue in enumerate(formatted_issues, 1):
  99. print(f"{i}. {issue}")
  100. else:
  101. print("No inconsistency issues found, document structure is consistent")
  102. return
  103. # Check if source and target directories are provided for directory-based operations
  104. if source_dir is None and target_dir is None:
  105. # Try to use current working directory as a fallback
  106. cwd = os.getcwd()
  107. if os.path.exists(os.path.join(cwd, "docs")):
  108. source_dir = os.path.join(cwd, "docs", "source")
  109. target_dir = os.path.join(cwd, "docs", "target")
  110. print(f"Using default directories based on current working directory:")
  111. print(f" Source directory: {source_dir}")
  112. print(f" Target directory: {target_dir}")
  113. else:
  114. print("Error: Source and target directories must be specified.")
  115. print("Usage examples:")
  116. print(" python docs_consistency_checker.py --source-dir ./docs/en --target-dir ./docs/zh")
  117. print(" python docs_consistency_checker.py --source-file ./docs/en/manuals/introduction.md --target-file ./docs/zh/manuals/introduction.md")
  118. print(" python docs_consistency_checker.py --source-dir ./docs/en --target-dir ./docs/zh --file manuals/introduction.md")
  119. return
  120. # Check if source directory exists
  121. if source_dir and not os.path.exists(source_dir):
  122. print(f"Error: Source directory does not exist: {source_dir}")
  123. return
  124. # Check if target directory exists
  125. if target_dir and not os.path.exists(target_dir):
  126. print(f"Error: Target directory does not exist: {target_dir}")
  127. return
  128. if output_file is None:
  129. output_file = "docs_structure_comparison.xlsx"
  130. # If a specific file is specified, only check that file
  131. if specific_file:
  132. print(f"Checking specific file: {specific_file}")
  133. # Build complete file paths
  134. source_file_path = os.path.join(source_dir, specific_file)
  135. target_file_path = os.path.join(target_dir, specific_file)
  136. # Check if files exist
  137. if not os.path.exists(source_file_path):
  138. print(f"Error: Source file does not exist: {source_file_path}")
  139. return
  140. if not os.path.exists(target_file_path):
  141. print(f"Error: Target file does not exist: {target_file_path}")
  142. return
  143. # Read file contents
  144. source_content = read_file_content(source_file_path)
  145. target_content = read_file_content(target_file_path)
  146. if source_content is None:
  147. print(f"Error: Unable to read source file: {source_file_path}")
  148. return
  149. if target_content is None:
  150. print(f"Error: Unable to read target file: {target_file_path}")
  151. return
  152. # Check if it's a Markdown file
  153. if not specific_file.endswith('.md'):
  154. print(f"Warning: File {specific_file} is not a Markdown file, skipping syntax tree comparison")
  155. return
  156. # Compare Markdown syntax trees
  157. print(f"Comparing Markdown syntax trees of {specific_file}...")
  158. inconsistencies = compare_markdown_syntax_trees(source_content, target_content, specific_file)
  159. # Output results
  160. if inconsistencies and inconsistencies != "Consistent":
  161. # Check if it's a return value from syntax tree comparison error
  162. if inconsistencies.startswith("Syntax tree comparison error:"):
  163. print(inconsistencies)
  164. else:
  165. # Split the inconsistency information separated by semicolons into a list, but preserve special separator |ERROR_SEPARATOR|
  166. # First check if there is [ERROR_COUNT:1] tag
  167. if "[ERROR_COUNT:1]" in inconsistencies:
  168. # If there is a tag, add the entire error as an element to the list
  169. issues = [inconsistencies]
  170. else:
  171. # If no tag, split normally, supporting both semicolon and newline as separators
  172. # Replace all newlines with semicolon+space, then split uniformly
  173. temp_inconsistencies = inconsistencies.replace("\n", "; ")
  174. issues = temp_inconsistencies.split("; ")
  175. # Calculate actual error count, considering special tag [ERROR_COUNT:1]
  176. error_count = 0
  177. formatted_issues = []
  178. for issue in issues:
  179. # Check if there is an error count tag
  180. if "[ERROR_COUNT:1]" in issue:
  181. error_count += 1
  182. # Remove tag and replace newlines with semicolons, then add to formatted issues list
  183. formatted_issue = issue.replace("[ERROR_COUNT:1]", "").replace("\n", "; ")
  184. formatted_issues.append(formatted_issue)
  185. else:
  186. # If no tag, check if it's a sub-error of a heading node (already counted error)
  187. # Check if the previous formatted issue contains "heading node" and "errors under:"
  188. is_sub_error = False
  189. for prev_issue in formatted_issues:
  190. if "heading node" in prev_issue and "errors under:" in prev_issue:
  191. is_sub_error = True
  192. break
  193. if not is_sub_error:
  194. error_count += 1
  195. # Ensure all newlines in issues are replaced with semicolons
  196. formatted_issue = issue.replace("\n", "; ")
  197. formatted_issues.append(formatted_issue)
  198. else:
  199. # Ensure all newlines in issues are replaced with semicolons
  200. formatted_issue = issue.replace("\n", "; ")
  201. formatted_issues.append(formatted_issue)
  202. print(f"Found {error_count} inconsistency issues:")
  203. for i, issue in enumerate(formatted_issues, 1):
  204. print(f"{i}. {issue}")
  205. else:
  206. print("No inconsistency issues found, document structure is consistent")
  207. else:
  208. # Run main function, passing parameters
  209. main(source_dir_path=source_dir, target_dir_path=target_dir, output_file_path=output_file)
  210. if __name__ == "__main__":
  211. # Parse command line arguments
  212. parser = argparse.ArgumentParser(description="Document Consistency Checker")
  213. parser.add_argument("--file", help="Specify the specific file path to check (relative to docs directory)")
  214. parser.add_argument("--source-dir", help="Source document directory path")
  215. parser.add_argument("--target-dir", help="Target document directory path")
  216. parser.add_argument("--output", help="Output Excel file path")
  217. parser.add_argument("--source-file", help="Specify source version file path")
  218. parser.add_argument("--target-file", help="Specify target version file path")
  219. args = parser.parse_args()
  220. print("Starting document consistency check...")
  221. run_docs_consistency_check(
  222. source_dir=args.source_dir,
  223. target_dir=args.target_dir,
  224. output_file=args.output,
  225. specific_file=args.file,
  226. source_file=args.source_file,
  227. target_file=args.target_file
  228. )
  229. print("Document consistency check completed!")