123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313 |
- import re
- import sys
- from collections import defaultdict
- from .file_handler import setup_console_encoding
def compare_markdown_syntax_trees(source_content, target_content, file_path):
    """
    Check whether two Markdown documents share the same syntax structure.

    Both documents are parsed with ``build_markdown_syntax_tree`` and the
    resulting node lists are handed to ``compare_syntax_trees``.

    Args:
        source_content (str): Source version Markdown content
        target_content (str): Target version Markdown content
        file_path (str): File path, forwarded to ``compare_syntax_trees``

    Returns:
        str: "Consistent" when no difference is reported, otherwise
            "Inconsistent: " followed by the description of the difference
    """
    # The source document is parsed first, then the target document.
    differences = compare_syntax_trees(
        build_markdown_syntax_tree(source_content),
        build_markdown_syntax_tree(target_content),
        file_path,
    )

    # An empty description means the two trees matched.
    if not differences:
        return "Consistent"
    return f"Inconsistent: {differences}"
# Block-level patterns (matched against the raw, unstripped line).
_HEADER_RE = re.compile(r'^(#{1,6})\s+(.*)')
_LIST_ITEM_RE = re.compile(r'^(\s*)([*+-]|\d+\.)\s+(.*)')
_HORIZONTAL_RULES = frozenset({'---', '***', '___'})

# Inline patterns.
_BOLD_RE = re.compile(r'\*\*(.*?)\*\*')
# A single '*' delimiter must not touch another '*'; otherwise the '**'
# delimiters of bold text would be reported as empty italic spans.
_ITALIC_RE = re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)')
_INLINE_CODE_RE = re.compile(r'`(.*?)`')
# The negative lookbehind keeps the "[alt](url)" tail of an image from being
# counted a second time as a link.
_LINK_RE = re.compile(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)')
_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')


def _inline_nodes(line, line_num):
    """Return the inline nodes (bold, italic, code, link, image) found in one line."""
    nodes = []
    for node_type, pattern in (
        ('bold', _BOLD_RE),
        ('italic', _ITALIC_RE),
        ('inline_code', _INLINE_CODE_RE),
    ):
        nodes.extend(
            {'type': node_type, 'text': match.group(1), 'line': line_num}
            for match in pattern.finditer(line)
        )
    nodes.extend(
        {
            'type': 'link',
            'text': match.group(1),
            'url': match.group(2),
            'line': line_num,
        }
        for match in _LINK_RE.finditer(line)
    )
    nodes.extend(
        {
            'type': 'image',
            'alt': match.group(1),
            'url': match.group(2),
            'line': line_num,
        }
        for match in _IMAGE_RE.finditer(line)
    )
    return nodes


def build_markdown_syntax_tree(content):
    """
    Build a flat syntax tree for a Markdown document, one node per token.

    Each non-empty line is classified, in this order, as a header, a list
    item, a code fence, a blockquote or a horizontal rule. Lines that match
    none of these are scanned for inline elements (bold, italic, inline code,
    links, images); per line, inline nodes are grouped by type in that order
    rather than by their position in the line. Block-level lines are not
    scanned for inline elements.

    Lines between an opening and a closing code fence are skipped, so code
    such as ``# comment`` is not mistaken for a header. Both fence lines still
    produce a ``code_block`` node. An unclosed fence runs to the end of the
    document.

    Args:
        content (str): Markdown document content

    Returns:
        list: Syntax tree represented as a list of node dicts. Every node has
            'type' and 'line' (1-based); the other keys depend on the type:
            header -> 'level', 'text'; list -> 'indent', 'marker', 'text';
            code_block / blockquote / bold / italic / inline_code -> 'text';
            link -> 'text', 'url'; image -> 'alt', 'url'; hr -> no extra keys.
    """
    if not content:
        return []

    tree = []
    # Length of the backtick run that opened the current fence; 0 when the
    # parser is not inside a fenced code block.
    fence_len = 0

    for line_num, line in enumerate(content.split('\n'), 1):
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            continue

        if fence_len:
            # Only a line made solely of backticks, at least as long as the
            # opening run, closes the fence; everything else is code.
            if set(stripped) == {'`'} and len(stripped) >= fence_len:
                fence_len = 0
                tree.append({
                    'type': 'code_block',
                    'text': stripped,
                    'line': line_num,
                })
            continue

        header_match = _HEADER_RE.match(line)
        if header_match:
            tree.append({
                'type': 'header',
                'level': len(header_match.group(1)),
                'text': header_match.group(2),
                'line': line_num,
            })
            continue

        list_match = _LIST_ITEM_RE.match(line)
        if list_match:
            tree.append({
                'type': 'list',
                'indent': len(list_match.group(1)),
                'marker': list_match.group(2),
                'text': list_match.group(3),
                'line': line_num,
            })
            continue

        if stripped.startswith('```'):
            tree.append({
                'type': 'code_block',
                'text': stripped,
                'line': line_num,
            })
            # A fence info string cannot contain backticks, so a line such as
            # "```x```" is self-contained and does not open a block.
            info = stripped.lstrip('`')
            if '`' not in info:
                fence_len = len(stripped) - len(info)
            continue

        if stripped.startswith('>'):
            tree.append({
                'type': 'blockquote',
                'text': stripped,
                'line': line_num,
            })
            continue

        if stripped in _HORIZONTAL_RULES:
            tree.append({
                'type': 'hr',
                'line': line_num,
            })
            continue

        tree.extend(_inline_nodes(line, line_num))

    return tree
def compare_syntax_trees(source_tree, target_tree, file_path):
    """
    Walk two Markdown syntax trees in lockstep and report the first mismatch.

    The trees must have the same number of nodes. Nodes are then compared
    pairwise: the node types must match, headers must share a level, list
    items must share indent and marker, and links and images must share a
    URL. Code blocks, blockquotes, horizontal rules, bold, italic and inline
    code are compared by type only.

    Args:
        source_tree (list): Source version syntax tree
        target_tree (list): Target version syntax tree
        file_path (str): File path for error messages (not read by this
            function's body)

    Returns:
        str: Description of the first difference found, or an empty string
            when the trees are consistent
    """
    # Configure console encoding before any message is produced.
    setup_console_encoding()

    # A different node count is reported without looking at individual nodes.
    if len(source_tree) != len(target_tree):
        return f"Number of elements inconsistent (Source: {len(source_tree)}, Target: {len(target_tree)})"

    for i, node_pair in enumerate(zip(source_tree, target_tree)):
        source_node, target_node = node_pair

        if source_node['type'] != target_node['type']:
            return f"Element type inconsistent at position {i+1} (Source: {source_node['type']}, Target: {target_node['type']})"

        # Both nodes share this type from here on.
        node_type = source_node['type']

        if node_type == 'header' and source_node['level'] != target_node['level']:
            return f"Header level inconsistent at line {source_node['line']} (Source: {source_node['level']}, Target: {target_node['level']})"

        if node_type == 'list':
            # Indent is checked before the marker.
            if source_node['indent'] != target_node['indent']:
                return f"List indent inconsistent at line {source_node['line']} (Source: {source_node['indent']}, Target: {target_node['indent']})"
            if source_node['marker'] != target_node['marker']:
                return f"List marker inconsistent at line {source_node['line']} (Source: {source_node['marker']}, Target: {target_node['marker']})"

        if node_type == 'link' and source_node['url'] != target_node['url']:
            return f"Link URL inconsistent at line {source_node['line']} (Source: {source_node['url']}, Target: {target_node['url']})"

        if node_type == 'image' and source_node['url'] != target_node['url']:
            return f"Image URL inconsistent at line {source_node['line']} (Source: {source_node['url']}, Target: {target_node['url']})"

        # All remaining node types need no attribute checks once the types match.

    # No difference found.
    return ""
def split_document_by_headers(content):
    """
    Split a Markdown document into sections keyed by header text.

    Every line belongs to the section of the most recent header line
    (``#`` to ``######`` followed by whitespace); the header line itself is
    part of its section. Lines before the first header are stored under the
    key "Introduction". Sections whose headers have the same text are merged
    under one key, in document order.

    Lines inside a fenced code block (``` ... ```) never start a new section,
    so a ``# comment`` in code stays inside the section containing the block.

    Args:
        content (str): Markdown document content

    Returns:
        dict: Plain dictionary mapping header text to the section content
            (lines joined with newlines), in order of first appearance.
            Empty when ``content`` is empty or None.
    """
    if not content:
        return {}

    section_lines = {}

    # Section used for anything that precedes the first header
    current_header = "Introduction"

    # Length of the backtick run that opened the current fence; 0 when the
    # scanner is not inside a fenced code block.
    fence_len = 0

    for line in content.split('\n'):
        stripped = line.strip()

        if fence_len:
            # Only a line made solely of backticks, at least as long as the
            # opening run, closes the fence.
            if set(stripped) == {'`'} and len(stripped) >= fence_len:
                fence_len = 0
        elif stripped.startswith('```'):
            # A fence info string cannot contain backticks, so "```x```" on
            # a single line does not open a block.
            info = stripped.lstrip('`')
            if '`' not in info:
                fence_len = len(stripped) - len(info)
        else:
            header_match = re.match(r'^(#{1,6})\s+(.*)', line)
            if header_match:
                current_header = header_match.group(2)

        section_lines.setdefault(current_header, []).append(line)

    # Return a plain dict so a missing-key lookup by the caller raises
    # KeyError instead of silently inserting an empty section.
    return {
        header: '\n'.join(body_lines)
        for header, body_lines in section_lines.items()
    }
def compare_section_content(source_sections, target_sections):
    """
    Compare the sections of two documents header by header.

    Sections present on both sides are parsed with
    ``build_markdown_syntax_tree`` and compared with ``compare_syntax_trees``
    (the header is passed as its ``file_path`` argument). Sections present on
    one side only are reported as such.

    Args:
        source_sections (dict): Source version document sections
        target_sections (dict): Target version document sections

    Returns:
        dict: Maps each header to "Consistent", "Inconsistent: <details>",
            "Source Only" or "Target Only". Keys follow the order of the
            source sections, followed by the headers found only in the
            target, so the result order is the same on every run.
    """
    results = {}

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would iterate in an order that changes between interpreter runs.
    ordered_headers = dict.fromkeys([*source_sections, *target_sections])

    for header in ordered_headers:
        if header not in target_sections:
            results[header] = "Source Only"
            continue
        if header not in source_sections:
            results[header] = "Target Only"
            continue

        # Both sections exist: compare their syntax structure.
        source_tree = build_markdown_syntax_tree(source_sections[header])
        target_tree = build_markdown_syntax_tree(target_sections[header])
        differences = compare_syntax_trees(source_tree, target_tree, header)

        if differences:
            results[header] = f"Inconsistent: {differences}"
        else:
            results[header] = "Consistent"

    return results
|