2
0

markdown_handler.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. import re
  2. import sys
  3. from collections import defaultdict
  4. from .file_handler import setup_console_encoding
  5. def compare_markdown_syntax_trees(source_content, target_content, file_path):
  6. """
  7. Build Markdown syntax trees and compare the syntax structure of two documents,
  8. identifying inconsistent positions.
  9. Args:
  10. source_content (str): Source version Markdown content
  11. target_content (str): Target version Markdown content
  12. file_path (str): File path for error messages
  13. Returns:
  14. str: Consistency check result, "Consistent" if consistent, otherwise error message
  15. """
  16. # Build syntax tree for source version
  17. source_tree = build_markdown_syntax_tree(source_content)
  18. # Build syntax tree for target version
  19. target_tree = build_markdown_syntax_tree(target_content)
  20. # Compare syntax trees
  21. differences = compare_syntax_trees(source_tree, target_tree, file_path)
  22. if differences:
  23. return f"Inconsistent: {differences}"
  24. else:
  25. return "Consistent"
  26. def build_markdown_syntax_tree(content):
  27. """
  28. Build a syntax tree for a Markdown document, with each token as a node.
  29. Args:
  30. content (str): Markdown document content
  31. Returns:
  32. list: Syntax tree represented as a list of nodes
  33. """
  34. if not content:
  35. return []
  36. # Split document into lines
  37. lines = content.split('\n')
  38. # Initialize syntax tree
  39. tree = []
  40. # Process each line
  41. for line_num, line in enumerate(lines, 1):
  42. # Skip empty lines
  43. if not line.strip():
  44. continue
  45. # Check if it's a header
  46. header_match = re.match(r'^(#{1,6})\s+(.*)', line)
  47. if header_match:
  48. level = len(header_match.group(1))
  49. text = header_match.group(2)
  50. tree.append({
  51. 'type': 'header',
  52. 'level': level,
  53. 'text': text,
  54. 'line': line_num
  55. })
  56. continue
  57. # Check if it's a list item
  58. list_match = re.match(r'^(\s*)([*+-]|\d+\.)\s+(.*)', line)
  59. if list_match:
  60. indent = len(list_match.group(1))
  61. marker = list_match.group(2)
  62. text = list_match.group(3)
  63. tree.append({
  64. 'type': 'list',
  65. 'indent': indent,
  66. 'marker': marker,
  67. 'text': text,
  68. 'line': line_num
  69. })
  70. continue
  71. # Check if it's a code block
  72. if line.strip().startswith('```'):
  73. tree.append({
  74. 'type': 'code_block',
  75. 'text': line.strip(),
  76. 'line': line_num
  77. })
  78. continue
  79. # Check if it's a blockquote
  80. if line.strip().startswith('>'):
  81. tree.append({
  82. 'type': 'blockquote',
  83. 'text': line.strip(),
  84. 'line': line_num
  85. })
  86. continue
  87. # Check if it's a horizontal rule
  88. if line.strip() in ['---', '***', '___']:
  89. tree.append({
  90. 'type': 'hr',
  91. 'line': line_num
  92. })
  93. continue
  94. # Check for inline elements
  95. # Bold text
  96. bold_matches = re.finditer(r'\*\*(.*?)\*\*', line)
  97. for match in bold_matches:
  98. tree.append({
  99. 'type': 'bold',
  100. 'text': match.group(1),
  101. 'line': line_num
  102. })
  103. # Italic text
  104. italic_matches = re.finditer(r'\*(.*?)\*', line)
  105. for match in italic_matches:
  106. tree.append({
  107. 'type': 'italic',
  108. 'text': match.group(1),
  109. 'line': line_num
  110. })
  111. # Inline code
  112. code_matches = re.finditer(r'`(.*?)`', line)
  113. for match in code_matches:
  114. tree.append({
  115. 'type': 'inline_code',
  116. 'text': match.group(1),
  117. 'line': line_num
  118. })
  119. # Links
  120. link_matches = re.finditer(r'\[([^\]]+)\]\(([^)]+)\)', line)
  121. for match in link_matches:
  122. tree.append({
  123. 'type': 'link',
  124. 'text': match.group(1),
  125. 'url': match.group(2),
  126. 'line': line_num
  127. })
  128. # Images
  129. img_matches = re.finditer(r'!\[([^\]]*)\]\(([^)]+)\)', line)
  130. for match in img_matches:
  131. tree.append({
  132. 'type': 'image',
  133. 'alt': match.group(1),
  134. 'url': match.group(2),
  135. 'line': line_num
  136. })
  137. return tree
  138. def compare_syntax_trees(source_tree, target_tree, file_path):
  139. """
  140. Compare two Markdown syntax trees, locate inconsistent positions by checking tree structure.
  141. Args:
  142. source_tree (list): Source version syntax tree
  143. target_tree (list): Target version syntax tree
  144. file_path (str): File path for error messages
  145. Returns:
  146. str: Error message describing differences, empty string if consistent
  147. """
  148. # Set console encoding to resolve character display issues
  149. setup_console_encoding()
  150. # Check if tree lengths are consistent
  151. if len(source_tree) != len(target_tree):
  152. return f"Number of elements inconsistent (Source: {len(source_tree)}, Target: {len(target_tree)})"
  153. # Check each element
  154. for i, (source_node, target_node) in enumerate(zip(source_tree, target_tree)):
  155. # Check if node types are consistent
  156. if source_node['type'] != target_node['type']:
  157. return f"Element type inconsistent at position {i+1} (Source: {source_node['type']}, Target: {target_node['type']})"
  158. # Check specific attributes based on node type
  159. if source_node['type'] == 'header':
  160. if source_node['level'] != target_node['level']:
  161. return f"Header level inconsistent at line {source_node['line']} (Source: {source_node['level']}, Target: {target_node['level']})"
  162. elif source_node['type'] == 'list':
  163. if source_node['indent'] != target_node['indent']:
  164. return f"List indent inconsistent at line {source_node['line']} (Source: {source_node['indent']}, Target: {target_node['indent']})"
  165. if source_node['marker'] != target_node['marker']:
  166. return f"List marker inconsistent at line {source_node['line']} (Source: {source_node['marker']}, Target: {target_node['marker']})"
  167. elif source_node['type'] == 'code_block':
  168. # For code blocks, we only check if both are code blocks, not content
  169. pass
  170. elif source_node['type'] == 'blockquote':
  171. # For blockquotes, we only check if both are blockquotes, not content
  172. pass
  173. elif source_node['type'] == 'hr':
  174. # For horizontal rules, we only check if both are horizontal rules
  175. pass
  176. elif source_node['type'] in ['bold', 'italic', 'inline_code']:
  177. # For inline elements, we only check if both are the same type
  178. pass
  179. elif source_node['type'] == 'link':
  180. # For links, check if URLs are consistent
  181. if source_node['url'] != target_node['url']:
  182. return f"Link URL inconsistent at line {source_node['line']} (Source: {source_node['url']}, Target: {target_node['url']})"
  183. elif source_node['type'] == 'image':
  184. # For images, check if URLs are consistent
  185. if source_node['url'] != target_node['url']:
  186. return f"Image URL inconsistent at line {source_node['line']} (Source: {source_node['url']}, Target: {target_node['url']})"
  187. # If no differences found, return empty string
  188. return ""
  189. def split_document_by_headers(content):
  190. """
  191. Split document into multiple parts based on headers.
  192. Args:
  193. content (str): Markdown document content
  194. Returns:
  195. dict: Dictionary with headers as keys and content as values
  196. """
  197. if not content:
  198. return {}
  199. # Split document into lines
  200. lines = content.split('\n')
  201. # Initialize result dictionary
  202. sections = defaultdict(list)
  203. # Current section header
  204. current_header = "Introduction"
  205. # Process each line
  206. for line in lines:
  207. # Check if it's a header
  208. header_match = re.match(r'^(#{1,6})\s+(.*)', line)
  209. if header_match:
  210. # Update current section header
  211. current_header = header_match.group(2)
  212. # Add line to current section
  213. sections[current_header].append(line)
  214. # Convert lists to strings
  215. for header, lines in sections.items():
  216. sections[header] = '\n'.join(lines)
  217. return sections
  218. def compare_section_content(source_sections, target_sections):
  219. """
  220. Compare content under two header nodes.
  221. Args:
  222. source_sections (dict): Source version document sections
  223. target_sections (dict): Target version document sections
  224. Returns:
  225. dict: Dictionary with comparison results for each section
  226. """
  227. results = {}
  228. # Get all unique section headers
  229. all_headers = set(source_sections.keys()).union(set(target_sections.keys()))
  230. # Compare each section
  231. for header in all_headers:
  232. source_exists = header in source_sections
  233. target_exists = header in target_sections
  234. if source_exists and target_exists:
  235. # Both sections exist, check content structure
  236. source_tree = build_markdown_syntax_tree(source_sections[header])
  237. target_tree = build_markdown_syntax_tree(target_sections[header])
  238. differences = compare_syntax_trees(source_tree, target_tree, header)
  239. if differences:
  240. results[header] = f"Inconsistent: {differences}"
  241. else:
  242. results[header] = "Consistent"
  243. elif source_exists:
  244. # Only source version exists
  245. results[header] = "Source Only"
  246. elif target_exists:
  247. # Only target version exists
  248. results[header] = "Target Only"
  249. return results