defold
/
doc
spiegel van https://github.com/defold/doc.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
							import re
import sys
from collections import defaultdict

from .file_handler import setup_console_encoding


def compare_markdown_syntax_trees(source_content, target_content, file_path):
    """
    Check whether two Markdown documents share the same syntax structure.

    Each document is turned into a flat syntax tree and the two trees are
    compared node by node.

    Args:
        source_content (str): Source version Markdown content
        target_content (str): Target version Markdown content
        file_path (str): File path, forwarded to the tree comparison

    Returns:
        str: "Consistent" when no difference is reported, otherwise
            "Inconsistent: " followed by the description of the first difference
    """
    # Source is parsed first, then target, exactly as listed in the tuple.
    source_tree, target_tree = (
        build_markdown_syntax_tree(text)
        for text in (source_content, target_content)
    )

    report = compare_syntax_trees(source_tree, target_tree, file_path)

    # An empty report means the comparison found nothing to complain about.
    return f"Inconsistent: {report}" if report else "Consistent"


def build_markdown_syntax_tree(content):
    """
    Build a flat syntax tree for a Markdown document, one node per token.

    Block-level constructs are tested in this order and the first match wins:
    header, list item, code fence, blockquote, horizontal rule. A line that is
    none of these is scanned for inline elements (bold, italic, inline code,
    links, images); all matches of one kind are emitted before the next kind.

    Lines between an opening and a closing ``` fence are code, not Markdown,
    and produce no nodes; only the two fence lines themselves are recorded.

    Args:
        content (str): Markdown document content

    Returns:
        list: Syntax tree represented as a list of node dicts. Every node has
            'type' and 'line' (1-based); other keys depend on the type
            ('level'/'text', 'indent'/'marker'/'text', 'text', 'alt'/'url', ...).
    """
    if not content:
        return []

    # Compile once instead of re-resolving every pattern on every line.
    header_re = re.compile(r'^(#{1,6})\s+(.*)')
    list_re = re.compile(r'^(\s*)([*+-]|\d+\.)\s+(.*)')

    # Inline patterns, in emission order, with the node keys their groups fill.
    inline_patterns = (
        ('bold', re.compile(r'\*\*(.*?)\*\*'), ('text',)),
        # Single asterisks only: the lookarounds stop the '**' delimiters of
        # bold text from being reported as empty italic spans.
        ('italic',
         re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)'),
         ('text',)),
        ('inline_code', re.compile(r'`(.*?)`'), ('text',)),
        # '(?<!!)' keeps the '[alt](url)' tail of an image from also being
        # counted as a link.
        ('link', re.compile(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)'), ('text', 'url')),
        ('image', re.compile(r'!\[([^\]]*)\]\(([^)]+)\)'), ('alt', 'url')),
    )

    tree = []
    in_fence = False

    for line_num, line in enumerate(content.split('\n'), 1):
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            continue

        # Inside a fenced code block nothing is Markdown except the closing fence.
        if in_fence:
            if stripped.startswith('```'):
                tree.append({
                    'type': 'code_block',
                    'text': stripped,
                    'line': line_num
                })
                in_fence = False
            continue

        # Header
        header_match = header_re.match(line)
        if header_match:
            tree.append({
                'type': 'header',
                'level': len(header_match.group(1)),
                'text': header_match.group(2),
                'line': line_num
            })
            continue

        # List item
        list_match = list_re.match(line)
        if list_match:
            tree.append({
                'type': 'list',
                'indent': len(list_match.group(1)),
                'marker': list_match.group(2),
                'text': list_match.group(3),
                'line': line_num
            })
            continue

        # Code fence
        if stripped.startswith('```'):
            tree.append({
                'type': 'code_block',
                'text': stripped,
                'line': line_num
            })
            # A real opening fence has no backtick after the fence run; a line
            # such as ```code``` is a one-line span and must not open a block.
            if '`' not in stripped.lstrip('`'):
                in_fence = True
            continue

        # Blockquote
        if stripped.startswith('>'):
            tree.append({
                'type': 'blockquote',
                'text': stripped,
                'line': line_num
            })
            continue

        # Horizontal rule
        if stripped in ('---', '***', '___'):
            tree.append({
                'type': 'hr',
                'line': line_num
            })
            continue

        # Inline elements
        for node_type, pattern, fields in inline_patterns:
            for match in pattern.finditer(line):
                node = {'type': node_type}
                node.update(zip(fields, match.groups()))
                node['line'] = line_num
                tree.append(node)

    return tree


def compare_syntax_trees(source_tree, target_tree, file_path):
    """
    Walk two Markdown syntax trees in lockstep and report the first mismatch.

    The trees must have the same number of nodes; after that, nodes are paired
    by position and compared by type, then by the type-specific attributes
    listed in the table below. Node kinds without an entry there (code blocks,
    blockquotes, horizontal rules, bold, italic, inline code) are compared by
    type only.

    Args:
        source_tree (list): Source version syntax tree
        target_tree (list): Target version syntax tree
        file_path (str): File path for error messages (accepted but not
            referenced by the current implementation)

    Returns:
        str: Description of the first difference found, empty string if consistent
    """
    # Prepare the console encoding before any message is produced.
    setup_console_encoding()

    source_count, target_count = len(source_tree), len(target_tree)
    if source_count != target_count:
        return f"Number of elements inconsistent (Source: {source_count}, Target: {target_count})"

    # node type -> ordered (attribute, message label) pairs that must match
    attribute_checks = {
        'header': (('level', 'Header level'),),
        'list': (('indent', 'List indent'), ('marker', 'List marker')),
        'link': (('url', 'Link URL'),),
        'image': (('url', 'Image URL'),),
    }

    for position, (src, tgt) in enumerate(zip(source_tree, target_tree), start=1):
        kind = src['type']
        if kind != tgt['type']:
            return f"Element type inconsistent at position {position} (Source: {kind}, Target: {tgt['type']})"

        for attribute, label in attribute_checks.get(kind, ()):
            if src[attribute] != tgt[attribute]:
                return f"{label} inconsistent at line {src['line']} (Source: {src[attribute]}, Target: {tgt[attribute]})"

    # Nothing differed
    return ""


def split_document_by_headers(content):
    """
    Split a document into sections keyed by header text.

    Every line (the header line included) is appended to the section of the
    most recent header. Lines before the first header go to a section named
    "Introduction". Sections whose headers have identical text are merged
    into one entry. Lines inside ``` fenced code blocks never start a new
    section, so a '# comment' in a code sample stays with its section.

    Args:
        content (str): Markdown document content

    Returns:
        dict: Plain dictionary with header text as keys and the section's
            lines joined by newlines as values; empty if content is empty
    """
    if not content:
        return {}

    header_re = re.compile(r'^(#{1,6})\s+(.*)')

    collected = {}
    current_header = "Introduction"
    in_fence = False

    for line in content.split('\n'):
        stripped = line.strip()

        if stripped.startswith('```'):
            if in_fence:
                in_fence = False
            elif '`' not in stripped.lstrip('`'):
                # Only a fence with no further backticks opens a block;
                # ```code``` on a single line is an inline span.
                in_fence = True
        elif not in_fence:
            header_match = header_re.match(line)
            if header_match:
                current_header = header_match.group(2)

        collected.setdefault(current_header, []).append(line)

    # Return a plain dict: a defaultdict would silently insert an empty list
    # (among string values) whenever a caller looked up a missing header.
    return {header: '\n'.join(body) for header, body in collected.items()}


def compare_section_content(source_sections, target_sections):
    """
    Compare the sections of two documents header by header.

    Sections present on both sides are parsed into syntax trees and compared
    structurally; sections present on only one side are flagged as such.
    Results are emitted in a deterministic order: source headers in the order
    they appear in source_sections, followed by headers that exist only in
    target_sections, in their own order.

    Args:
        source_sections (dict): Source version document sections
        target_sections (dict): Target version document sections

    Returns:
        dict: Maps each header to "Consistent", "Inconsistent: <details>",
            "Source Only" or "Target Only"
    """
    results = {}

    # Walk the source mapping directly rather than a set union of headers:
    # set iteration order for strings changes between interpreter runs.
    for header, source_text in source_sections.items():
        if header not in target_sections:
            results[header] = "Source Only"
            continue

        source_tree = build_markdown_syntax_tree(source_text)
        target_tree = build_markdown_syntax_tree(target_sections[header])
        differences = compare_syntax_trees(source_tree, target_tree, header)

        results[header] = (
            f"Inconsistent: {differences}" if differences else "Consistent"
        )

    # Whatever is left on the target side has no source counterpart.
    for header in target_sections:
        if header not in source_sections:
            results[header] = "Target Only"

    return results