Skip to main content

scripts-smart-merge

#!/usr/bin/env python3
"""CODITECT Smart Merge - Intelligent document merging with LLM analysis.

Merges similar documents (ADRs, markdown, code) using structural analysis
and optional LLM-powered diff analysis for conflict resolution.

Similarity Classification:
    IDENTICAL      - Raw hash match (byte-for-byte identical)
    NORMALIZED_DUP - Same content, different whitespace
    NEAR_DUPLICATE - >95% content similarity (difflib)
    SIMILAR        - 70-95% content similarity
    RELATED        - 50-70% content similarity

Metadata:
    title: "Optional: Claude API for LLM analysis"
    component_type: script
    version: "1.0.0"
    audience: contributor
    status: stable
    summary: "CODITECT Smart Merge - Intelligent document merging with LLM analysis."
    keywords: ['analysis', 'api', 'merge', 'review', 'smart']
    tokens: ~500
    created: 2025-12-22
    updated: 2025-12-22
    script_name: "smart-merge.py"
    language: python
    executable: true
    usage: "python3 scripts/smart-merge.py [options]"
    python_version: "3.10+"
    dependencies: []
    modifies_files: false
    network_access: false
    requires_auth: false

Author: AZ1.AI INC (Hal Casteel)
Version: 2.0.0
Date: 2025-12-11
"""

import argparse
import hashlib
import json
import os
import re
import subprocess
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Claude API for LLM analysis (graceful degradation when absent)
try:
    import anthropic
    HAS_ANTHROPIC = True
except ImportError:
    HAS_ANTHROPIC = False

# Similarity classification thresholds
class SimilarityLevel:
    """Similarity classification labels, ordered from most to least similar."""
    IDENTICAL = "IDENTICAL"            # Raw hash match (byte-identical)
    NORMALIZED_DUP = "NORMALIZED_DUP"  # Normalized hash match
    NEAR_DUPLICATE = "NEAR_DUPLICATE"  # >95% content match
    SIMILAR = "SIMILAR"                # 70-95% content match
    RELATED = "RELATED"                # 50-70% content match
    DIFFERENT = "DIFFERENT"            # <50% content match


# Minimum difflib ratio required to qualify for each level.
SIMILARITY_THRESHOLDS = {
    SimilarityLevel.NEAR_DUPLICATE: 0.95,
    SimilarityLevel.SIMILAR: 0.70,
    SimilarityLevel.RELATED: 0.50,
}

@dataclass
class Section:
    """Represents a document section delimited by a markdown heading."""
    level: int       # Heading level (1-6)
    title: str       # Heading text (without the leading '#' marks)
    content: str     # Body text between this heading and the next
    line_start: int  # 0-based index of the heading line
    line_end: int    # 0-based index of the section's last line
    hash: str = ""   # Short fingerprint, filled in by __post_init__

    def __post_init__(self):
        # Fingerprint title+content so identical sections compare cheaply;
        # line positions deliberately do NOT affect the hash.
        self.hash = hashlib.sha256(
            f"{self.title}:{self.content}".encode()
        ).hexdigest()[:16]

@dataclass
class MergeConflict:
    """Represents a merge conflict between two versions of the same section."""
    section_title: str
    content_a: str
    content_b: str
    source_a: str  # Filename the A version came from
    source_b: str  # Filename the B version came from
    resolution: Optional[str] = None         # Chosen merged content, once resolved
    resolution_reason: Optional[str] = None  # Human-readable reason for the choice

@dataclass
class MergeResult:
    """Result of a merge operation."""
    success: bool
    merged_content: str
    conflicts: List[MergeConflict] = field(default_factory=list)
    stats: Dict[str, Any] = field(default_factory=dict)
    llm_analysis: Optional[str] = None  # JSON dump of per-section LLM reasoning, if any

@dataclass
class SimilarityResult:
    """Result of similarity analysis between two files."""
    file_a: Path
    file_b: Path
    level: str                   # SimilarityLevel classification
    raw_hash_match: bool         # Byte-identical
    normalized_hash_match: bool  # Content-identical (whitespace normalized)
    content_ratio: float         # difflib SequenceMatcher ratio (0.0-1.0)
    raw_hash_a: str
    raw_hash_b: str
    normalized_hash_a: str
    normalized_hash_b: str
    lines_a: int
    lines_b: int
    bytes_a: int
    bytes_b: int
    recommendation: str

class SmartMerger:
    """Intelligent document merger with LLM support."""

    # Matches markdown ATX headings: 1-6 '#' marks, whitespace, then the title.
    HEADING_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
    # Loose patterns for common document metadata fields (first match wins).
    METADATA_PATTERNS = {
        'version': re.compile(r'[Vv]ersion[:\s]+(\d+\.\d+\.?\d*)'),
        'date': re.compile(r'[Dd]ate[:\s]+(\d{4}-\d{2}-\d{2})'),
        'status': re.compile(r'[Ss]tatus[:\s]+(\w+)'),
        'author': re.compile(r'[Aa]uthor[:\s]+(.+)'),
    }

def __init__(self, use_llm: bool = False, model: str = "claude-sonnet-4-20250514"):
    """Set up the merger.

    LLM-assisted merging requires the ``anthropic`` package AND the
    ANTHROPIC_API_KEY environment variable; otherwise it is disabled
    with a warning and merges fall back to heuristics.
    """
    self.use_llm = use_llm and HAS_ANTHROPIC
    self.model = model
    self.client = None

    if self.use_llm:
        api_key = os.environ.get('ANTHROPIC_API_KEY')
        if api_key:
            self.client = anthropic.Anthropic(api_key=api_key)
        else:
            print("Warning: ANTHROPIC_API_KEY not set, LLM analysis disabled")
            self.use_llm = False

def raw_hash(self, content: str) -> str:
    """Generate SHA-256 hash of raw content (byte-identical check)."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def normalized_hash(self, content: str) -> str:
    """Generate SHA-256 hash of normalized content (whitespace-insensitive).

    Normalization: strip, lowercase, then collapse every whitespace run
    (spaces, tabs, newlines) to a single space — so files that differ
    only in formatting or letter case hash identically.
    """
    normalized = re.sub(r'\s+', ' ', content.strip().lower())
    return hashlib.sha256(normalized.encode('utf-8')).hexdigest()

def content_similarity(self, content_a: str, content_b: str) -> float:
    """
    Calculate content similarity using difflib SequenceMatcher.
    Returns ratio from 0.0 (completely different) to 1.0 (identical).
    """
    return SequenceMatcher(None, content_a, content_b).ratio()

def classify_similarity(
    self,
    raw_match: bool,
    normalized_match: bool,
    content_ratio: float
) -> str:
    """Classify similarity level based on hash and content analysis.

    Precedence: hash matches trump the difflib ratio; otherwise the
    ratio is bucketed against SIMILARITY_THRESHOLDS, highest bucket wins.
    """
    if raw_match:
        return SimilarityLevel.IDENTICAL
    if normalized_match:
        return SimilarityLevel.NORMALIZED_DUP
    if content_ratio >= SIMILARITY_THRESHOLDS[SimilarityLevel.NEAR_DUPLICATE]:
        return SimilarityLevel.NEAR_DUPLICATE
    if content_ratio >= SIMILARITY_THRESHOLDS[SimilarityLevel.SIMILAR]:
        return SimilarityLevel.SIMILAR
    if content_ratio >= SIMILARITY_THRESHOLDS[SimilarityLevel.RELATED]:
        return SimilarityLevel.RELATED
    return SimilarityLevel.DIFFERENT

def get_recommendation(self, level: str, bytes_a: int, bytes_b: int) -> str:
    """Generate action recommendation based on similarity level.

    For NEAR_DUPLICATE/SIMILAR pairs, appends a hint to keep whichever
    file is more than 20% larger (assumed to carry more content).
    """
    recommendations = {
        SimilarityLevel.IDENTICAL: "DELETE_ONE: Files are byte-identical. Safe to delete either.",
        SimilarityLevel.NORMALIZED_DUP: "DELETE_ONE: Same content, only whitespace differs. Safe to delete either.",
        SimilarityLevel.NEAR_DUPLICATE: "REVIEW_DELETE: Very similar (>95%). Review briefly, then delete one.",
        SimilarityLevel.SIMILAR: "MERGE_RECOMMENDED: Significant overlap (70-95%). Smart merge recommended.",
        SimilarityLevel.RELATED: "REVIEW_NEEDED: Some overlap (50-70%). Manual review before action.",
        SimilarityLevel.DIFFERENT: "NO_ACTION: Files are different (<50% similar).",
    }

    rec = recommendations.get(level, "UNKNOWN")

    # Add size guidance for non-identical files
    if level in (SimilarityLevel.NEAR_DUPLICATE, SimilarityLevel.SIMILAR):
        if bytes_a > bytes_b * 1.2:
            rec += f" Keep A (larger by {bytes_a - bytes_b} bytes)."
        elif bytes_b > bytes_a * 1.2:
            rec += f" Keep B (larger by {bytes_b - bytes_a} bytes)."

    return rec

def analyze_similarity_detailed(self, file_a: Path, file_b: Path) -> SimilarityResult:
    """
    Perform detailed similarity analysis using multiple methods:
    1. Raw hash (byte-identical)
    2. Normalized hash (whitespace-insensitive)
    3. Content ratio (difflib SequenceMatcher)

    Reads both files as UTF-8; propagates any read/decode errors.
    """
    content_a = file_a.read_text(encoding='utf-8')
    content_b = file_b.read_text(encoding='utf-8')

    # Compute hashes
    raw_hash_a = self.raw_hash(content_a)
    raw_hash_b = self.raw_hash(content_b)
    norm_hash_a = self.normalized_hash(content_a)
    norm_hash_b = self.normalized_hash(content_b)

    raw_match = raw_hash_a == raw_hash_b
    normalized_match = norm_hash_a == norm_hash_b

    # Skip the expensive difflib pass when either hash already matches.
    if raw_match or normalized_match:
        content_ratio = 1.0
    else:
        content_ratio = self.content_similarity(content_a, content_b)

    # Classify and recommend
    level = self.classify_similarity(raw_match, normalized_match, content_ratio)
    recommendation = self.get_recommendation(
        level, len(content_a.encode('utf-8')), len(content_b.encode('utf-8'))
    )

    return SimilarityResult(
        file_a=file_a,
        file_b=file_b,
        level=level,
        raw_hash_match=raw_match,
        normalized_hash_match=normalized_match,
        content_ratio=content_ratio,
        raw_hash_a=raw_hash_a,
        raw_hash_b=raw_hash_b,
        normalized_hash_a=norm_hash_a,
        normalized_hash_b=norm_hash_b,
        lines_a=len(content_a.split('\n')),
        lines_b=len(content_b.split('\n')),
        bytes_a=len(content_a.encode('utf-8')),
        bytes_b=len(content_b.encode('utf-8')),
        recommendation=recommendation
    )

def extract_sections(self, content: str, source_name: str = "") -> List[Section]:
    """Extract sections from markdown content.

    Splits on ATX headings (HEADING_PATTERN). Any text before the first
    heading is not captured in a section. ``source_name`` is accepted for
    interface compatibility but is not used here.
    """
    sections = []
    lines = content.split('\n')

    current_section = None
    section_lines = []

    for i, line in enumerate(lines):
        match = self.HEADING_PATTERN.match(line)
        if match:
            # Close out the previous section before starting a new one.
            if current_section:
                current_section.content = '\n'.join(section_lines).strip()
                current_section.line_end = i - 1
                sections.append(current_section)

            # Start new section at this heading line.
            current_section = Section(
                level=len(match.group(1)),
                title=match.group(2).strip(),
                content="",
                line_start=i,
                line_end=i
            )
            section_lines = []
        else:
            section_lines.append(line)

    # Don't forget the last section
    if current_section:
        current_section.content = '\n'.join(section_lines).strip()
        current_section.line_end = len(lines) - 1
        sections.append(current_section)

    return sections

def extract_metadata(self, content: str) -> Dict[str, str]:
    """Extract document metadata (version, date, status, author).

    Only the first 2KB of the document is scanned, on the assumption
    that metadata lives near the top; first match per field wins.
    """
    metadata = {}
    for key, pattern in self.METADATA_PATTERNS.items():
        match = pattern.search(content[:2000])  # Check first 2KB
        if match:
            metadata[key] = match.group(1).strip()
    return metadata

def compare_sections(
    self,
    sections_a: List[Section],
    sections_b: List[Section]
) -> Dict[str, Any]:
    """Compare sections between two documents, keyed by section title.

    NOTE: duplicate titles within a single document collapse to the last
    occurrence, because sections are indexed by title here.
    """
    titles_a = {s.title: s for s in sections_a}
    titles_b = {s.title: s for s in sections_b}

    comparison = {
        'only_in_a': [],
        'only_in_b': [],
        'identical': [],
        'different': [],
    }

    for title in set(titles_a) | set(titles_b):
        in_a = title in titles_a
        in_b = title in titles_b

        if in_a and not in_b:
            comparison['only_in_a'].append(titles_a[title])
        elif in_b and not in_a:
            comparison['only_in_b'].append(titles_b[title])
        else:
            # Present in both: content-hash equality decides identical vs conflict.
            sec_a = titles_a[title]
            sec_b = titles_b[title]

            if sec_a.hash == sec_b.hash:
                comparison['identical'].append((sec_a, sec_b))
            else:
                comparison['different'].append((sec_a, sec_b))

    return comparison

def analyze_with_llm(
    self,
    content_a: str,
    content_b: str,
    section_title: str,
    source_a: str,
    source_b: str
) -> Tuple[str, str]:
    """Use LLM to analyze and merge conflicting sections.

    Returns (merged_content, reason). Falls back to version A when no
    client is configured, the API call fails, or the response contains
    no parseable JSON. Each version is truncated to 3000 chars in the
    prompt to bound token usage.
    """
    if not self.client:
        return content_a, "LLM not available, kept version A"

    prompt = f"""You are analyzing two versions of a document section that need to be merged.

SECTION TITLE: {section_title}

VERSION A (from {source_a}):

{content_a[:3000]}

VERSION B (from {source_b}):

{content_b[:3000]}

TASK: Analyze these versions and produce a merged result that:

1. Preserves ALL unique information from both versions
2. Uses the more detailed/accurate version when they conflict
3. Maintains consistent formatting
4. Keeps the most recent metadata (dates, versions)

Respond with JSON: {{ "merged_content": "The merged section content...", "reasoning": "Brief explanation of merge decisions", "confidence": 0.95 }}"""

    try:
        response = self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
        )

        # Parse JSON response; the model may wrap it in a markdown code block,
        # so grab the outermost {...} span instead of parsing the whole text.
        response_text = response.content[0].text
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            result = json.loads(json_match.group())
            return result.get('merged_content', content_a), result.get('reasoning', 'LLM merge')

    except Exception as e:
        print(f"LLM analysis error: {e}")

    return content_a, "LLM analysis failed, kept version A"

def merge_metadata_smart(
    self,
    meta_a: Dict[str, str],
    meta_b: Dict[str, str]
) -> Dict[str, str]:
    """Smartly merge document metadata.

    Rules: version -> numerically highest; date -> most recent (ISO string
    compare); status -> highest-priority lifecycle stage; author -> union
    of both, comma-joined and sorted.
    """
    merged = {}

    # Version: use highest, comparing numerically (so 1.10 > 1.2).
    if 'version' in meta_a or 'version' in meta_b:
        v_a = meta_a.get('version', '0.0.0')
        v_b = meta_b.get('version', '0.0.0')
        try:
            # Pad with zeros so short versions ("1.2") compare cleanly.
            merged['version'] = max(
                v_a, v_b,
                key=lambda v: list(map(int, v.split('.')[:3] + ['0', '0', '0']))
            )
        except ValueError:
            # Non-numeric component: fall back to whichever is present.
            merged['version'] = v_a or v_b

    # Date: use most recent (ISO dates sort lexicographically).
    if 'date' in meta_a or 'date' in meta_b:
        d_a = meta_a.get('date', '1970-01-01')
        d_b = meta_b.get('date', '1970-01-01')
        merged['date'] = max(d_a, d_b)

    # Status: prefer "Implemented" > "Accepted" > "Draft" > "Proposed"
    status_priority = {'implemented': 4, 'accepted': 3, 'draft': 2, 'proposed': 1}
    if 'status' in meta_a or 'status' in meta_b:
        s_a = meta_a.get('status', 'draft').lower()
        s_b = meta_b.get('status', 'draft').lower()
        if status_priority.get(s_a, 0) >= status_priority.get(s_b, 0):
            merged['status'] = meta_a.get('status', meta_b.get('status', 'Draft'))
        else:
            merged['status'] = meta_b.get('status', 'Draft')

    # Author: merge unique authors
    if 'author' in meta_a or 'author' in meta_b:
        authors = set()
        if 'author' in meta_a:
            authors.add(meta_a['author'])
        if 'author' in meta_b:
            authors.add(meta_b['author'])
        merged['author'] = ', '.join(sorted(authors))

    return merged

def merge_documents(
    self,
    file_a: Path,
    file_b: Path,
    output_file: Optional[Path] = None,
    strategy: str = "smart"  # "smart", "prefer_a", "prefer_b", "longer"
) -> MergeResult:
    """Merge two documents intelligently.

    Duplicate files (raw or normalized hash match) short-circuit without
    merging. Otherwise sections are extracted, compared by title, and
    conflicts resolved per ``strategy``; a new document is assembled with
    merged metadata frontmatter and a merge footer.

    NOTE: the merged document orders sections by (level, title), so the
    original document order is only approximated.
    """
    content_a = file_a.read_text(encoding='utf-8')
    content_b = file_b.read_text(encoding='utf-8')

    # Quick check: identical files (use raw hash)
    if self.raw_hash(content_a) == self.raw_hash(content_b):
        return MergeResult(
            success=True,
            merged_content=content_a,
            stats={
                'status': 'IDENTICAL',
                'action': 'no_merge_needed',
                'message': 'Files are byte-identical'
            }
        )

    # Check normalized hash
    if self.normalized_hash(content_a) == self.normalized_hash(content_b):
        return MergeResult(
            success=True,
            merged_content=content_a,
            stats={
                'status': 'NORMALIZED_DUP',
                'action': 'no_merge_needed',
                'message': 'Files have identical content (whitespace differs)'
            }
        )

    # Extract structure
    sections_a = self.extract_sections(content_a, file_a.name)
    sections_b = self.extract_sections(content_b, file_b.name)
    meta_a = self.extract_metadata(content_a)
    meta_b = self.extract_metadata(content_b)

    # Compare sections
    comparison = self.compare_sections(sections_a, sections_b)

    # Build merged document
    merged_sections = []
    conflicts = []
    llm_analyses = []

    # 1. Sections only in A are kept verbatim
    for section in comparison['only_in_a']:
        merged_sections.append((section.level, section.title, section.content))

    # 2. Sections only in B are kept verbatim
    for section in comparison['only_in_b']:
        merged_sections.append((section.level, section.title, section.content))

    # 3. Identical sections: keep one copy
    for sec_a, sec_b in comparison['identical']:
        merged_sections.append((sec_a.level, sec_a.title, sec_a.content))

    # 4. Differing sections: resolve the conflict per strategy
    for sec_a, sec_b in comparison['different']:
        conflict = MergeConflict(
            section_title=sec_a.title,
            content_a=sec_a.content,
            content_b=sec_b.content,
            source_a=file_a.name,
            source_b=file_b.name
        )

        if strategy == "prefer_a":
            merged_content = sec_a.content
            reason = "Strategy: prefer_a"
        elif strategy == "prefer_b":
            merged_content = sec_b.content
            reason = "Strategy: prefer_b"
        elif strategy == "longer":
            reason = f"Strategy: longer (A={len(sec_a.content)}, B={len(sec_b.content)})"
            if len(sec_a.content) >= len(sec_b.content):
                merged_content = sec_a.content
            else:
                merged_content = sec_b.content
        elif strategy == "smart" and self.use_llm:
            merged_content, reason = self.analyze_with_llm(
                sec_a.content, sec_b.content,
                sec_a.title, file_a.name, file_b.name
            )
            llm_analyses.append({
                'section': sec_a.title,
                'reason': reason
            })
        else:
            # Default (smart without LLM): prefer longer version
            if len(sec_a.content) >= len(sec_b.content):
                merged_content = sec_a.content
                reason = "Default: kept longer version (A)"
            else:
                merged_content = sec_b.content
                reason = "Default: kept longer version (B)"

        conflict.resolution = merged_content
        conflict.resolution_reason = reason
        conflicts.append(conflict)

        merged_sections.append((sec_a.level, sec_a.title, merged_content))

    # Sort sections by original order (approximation based on level):
    # top-level sections first, then subsections, alphabetical within level.
    merged_sections.sort(key=lambda x: (x[0], x[1]))

    # Rebuild document
    merged_meta = self.merge_metadata_smart(meta_a, meta_b)

    lines = []

    # Add frontmatter header if we have metadata
    if merged_meta:
        lines.append("---")
        for key, value in merged_meta.items():
            lines.append(f"{key.title()}: {value}")
        lines.append("---")
        lines.append("")

    # Add sections
    for level, title, content in merged_sections:
        lines.append(f"{'#' * level} {title}")
        lines.append("")
        lines.append(content)
        lines.append("")

    # Add merge footer
    lines.append("---")
    lines.append("")
    lines.append(f"*Merged by CODITECT Smart Merge on {datetime.now(timezone.utc).isoformat()}*")
    lines.append(f"*Sources: {file_a.name}, {file_b.name}*")

    merged_content = '\n'.join(lines)

    # Write output if specified
    if output_file:
        output_file.write_text(merged_content, encoding='utf-8')

    return MergeResult(
        success=True,
        merged_content=merged_content,
        conflicts=conflicts,
        stats={
            'sections_a': len(sections_a),
            'sections_b': len(sections_b),
            'only_in_a': len(comparison['only_in_a']),
            'only_in_b': len(comparison['only_in_b']),
            'identical': len(comparison['identical']),
            'conflicts_resolved': len(conflicts),
            'strategy': strategy,
            'llm_used': self.use_llm and strategy == "smart"
        },
        llm_analysis=json.dumps(llm_analyses, indent=2) if llm_analyses else None
    )

# Legacy method for backwards compatibility
def analyze_similarity(self, file_a: Path, file_b: Path) -> Dict[str, Any]:
    """Analyze similarity between two documents (legacy interface).

    Returns a flat, JSON-friendly dict: content_ratio is a percentage
    (0-100, one decimal) and hashes are truncated for display.
    """
    result = self.analyze_similarity_detailed(file_a, file_b)

    return {
        'file_a': str(result.file_a),
        'file_b': str(result.file_b),
        'level': result.level,
        'raw_hash_match': result.raw_hash_match,
        'normalized_hash_match': result.normalized_hash_match,
        'content_ratio': round(result.content_ratio * 100, 1),
        'raw_hash_a': result.raw_hash_a[:12] + '...',
        'raw_hash_b': result.raw_hash_b[:12] + '...',
        'lines_a': result.lines_a,
        'lines_b': result.lines_b,
        'bytes_a': result.bytes_a,
        'bytes_b': result.bytes_b,
        'recommendation': result.recommendation
    }

def find_similar_files(
    directory: Path,
    pattern: str = "*.md",
    threshold: float = 0.5,
    duplicates_only: bool = False
) -> List[Dict[str, Any]]:
    """
    Find potentially similar files in a directory.

    Only files sharing the same basename are compared (pairwise within
    each name group) — files with different names are never checked.

    Args:
        directory: Directory to scan (recursively)
        pattern: Glob pattern for files
        threshold: Minimum similarity ratio (0.0-1.0)
        duplicates_only: If True, only return IDENTICAL and NORMALIZED_DUP

    Returns:
        List of similarity results with classification
    """
    files = list(directory.rglob(pattern))
    results = []
    merger = SmartMerger()

    # Group by filename (without path)
    by_name = {}
    for f in files:
        by_name.setdefault(f.name, []).append(f)

    # Compare every pair within each same-name group
    for name, paths in by_name.items():
        if len(paths) < 2:
            continue
        for i, path_a in enumerate(paths):
            for path_b in paths[i + 1:]:
                try:
                    result = merger.analyze_similarity_detailed(path_a, path_b)

                    # Filter based on mode
                    if duplicates_only:
                        if result.level not in (SimilarityLevel.IDENTICAL,
                                                SimilarityLevel.NORMALIZED_DUP):
                            continue
                    elif result.content_ratio < threshold:
                        continue

                    results.append({
                        'file_a': str(path_a),
                        'file_b': str(path_b),
                        'level': result.level,
                        'content_ratio': round(result.content_ratio * 100, 1),
                        'raw_hash_match': result.raw_hash_match,
                        'normalized_hash_match': result.normalized_hash_match,
                        'bytes_a': result.bytes_a,
                        'bytes_b': result.bytes_b,
                        'recommendation': result.recommendation
                    })
                except Exception as e:
                    # Best-effort scan: report and keep going on unreadable files.
                    print(f"Error comparing {path_a} and {path_b}: {e}", file=sys.stderr)

    return results

def main():
    """CLI entry point: `analyze`, `merge`, or `find` sub-commands."""
    parser = argparse.ArgumentParser(
        description="CODITECT Smart Merge v2.0 - Intelligent document merging with proper similarity classification"
    )

    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Analyze command
    analyze_parser = subparsers.add_parser('analyze', help='Analyze similarity between two files')
    analyze_parser.add_argument('file_a', type=Path, help='First file')
    analyze_parser.add_argument('file_b', type=Path, help='Second file')
    analyze_parser.add_argument('--json', action='store_true', help='Output as JSON')

    # Merge command
    merge_parser = subparsers.add_parser('merge', help='Merge two files')
    merge_parser.add_argument('file_a', type=Path, help='First file')
    merge_parser.add_argument('file_b', type=Path, help='Second file')
    merge_parser.add_argument('-o', '--output', type=Path, help='Output file')
    merge_parser.add_argument(
        '--strategy',
        choices=['smart', 'prefer_a', 'prefer_b', 'longer'],
        default='smart',
        help='Merge strategy'
    )
    merge_parser.add_argument('--llm', action='store_true', help='Use LLM for conflict resolution')
    merge_parser.add_argument('--dry-run', action='store_true', help='Show what would be merged')
    merge_parser.add_argument('--json', action='store_true', help='Output stats as JSON')

    # Find command
    find_parser = subparsers.add_parser('find', help='Find similar files in directory')
    find_parser.add_argument('directory', type=Path, help='Directory to scan')
    find_parser.add_argument('--pattern', default='*.md', help='File pattern (default: *.md)')
    find_parser.add_argument('--threshold', type=float, default=0.5, help='Similarity threshold 0-1 (default: 0.5)')
    find_parser.add_argument('--duplicates-only', action='store_true',
                             help='Only show true duplicates (IDENTICAL or NORMALIZED_DUP)')
    find_parser.add_argument('--json', action='store_true', help='Output as JSON')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    if args.command == 'analyze':
        merger = SmartMerger()
        result = merger.analyze_similarity(args.file_a, args.file_b)

        if args.json:
            print(json.dumps(result, indent=2))
        else:
            # ANSI color coding: red = action needed, green = no action
            level_colors = {
                SimilarityLevel.IDENTICAL: '\033[91m',       # Red (action needed)
                SimilarityLevel.NORMALIZED_DUP: '\033[91m',  # Red
                SimilarityLevel.NEAR_DUPLICATE: '\033[93m',  # Yellow
                SimilarityLevel.SIMILAR: '\033[93m',         # Yellow
                SimilarityLevel.RELATED: '\033[94m',         # Blue
                SimilarityLevel.DIFFERENT: '\033[92m',       # Green (no action)
            }
            reset = '\033[0m'
            color = level_colors.get(result['level'], '')

            print(f"\n{'='*70}")
            print(f"SMART MERGE ANALYSIS v2.0")
            print(f"{'='*70}")
            print(f"File A: {result['file_a']}")
            print(f"File B: {result['file_b']}")
            print(f"{'─'*70}")
            print(f"Bytes: A={result['bytes_a']:,}, B={result['bytes_b']:,}")
            print(f"Lines: A={result['lines_a']}, B={result['lines_b']}")
            print(f"{'─'*70}")
            print(f"Raw Hash Match: {result['raw_hash_match']}")
            print(f"Normalized Hash Match: {result['normalized_hash_match']}")
            print(f"Content Similarity: {result['content_ratio']}%")
            print(f"{'─'*70}")
            print(f"Classification: {color}{result['level']}{reset}")
            print(f"{'─'*70}")
            print(f"Recommendation: {result['recommendation']}")
            print(f"{'='*70}\n")

    elif args.command == 'merge':
        merger = SmartMerger(use_llm=args.llm)

        if args.dry_run:
            # Analyze only; never touch the output file.
            result = merger.analyze_similarity(args.file_a, args.file_b)
            print(f"DRY RUN - Would merge:")
            print(f" {args.file_a}")
            print(f" {args.file_b}")
            print(f" Strategy: {args.strategy}")
            print(f" LLM: {args.llm}")
            print(f" Output: {args.output or 'stdout'}")
            print(f" Level: {result['level']}")
            print(f" Recommendation: {result['recommendation']}")
            sys.exit(0)

        result = merger.merge_documents(
            args.file_a,
            args.file_b,
            args.output,
            args.strategy
        )

        if args.json:
            print(json.dumps({
                'success': result.success,
                'stats': result.stats,
                'conflicts': len(result.conflicts),
                'output': str(args.output) if args.output else None
            }, indent=2))
        else:
            if args.output:
                print(f"Merged document written to: {args.output}")
            else:
                print(result.merged_content)

            print(f"\nMerge Stats:")
            for key, value in result.stats.items():
                print(f" {key}: {value}")

            if result.conflicts:
                print(f"\nResolved {len(result.conflicts)} conflicts:")
                for c in result.conflicts:
                    print(f" - {c.section_title}: {c.resolution_reason}")

    elif args.command == 'find':
        results = find_similar_files(
            args.directory,
            args.pattern,
            args.threshold,
            args.duplicates_only
        )

        if args.json:
            print(json.dumps(results, indent=2))
        else:
            if not results:
                if args.duplicates_only:
                    print(f"No true duplicates found (IDENTICAL or NORMALIZED_DUP)")
                else:
                    print(f"No similar files found (threshold: {args.threshold*100}%)")
            else:
                # Group by classification level
                by_level = {}
                for r in results:
                    by_level.setdefault(r['level'], []).append(r)

                # Print in order of severity (most actionable first)
                level_order = [
                    SimilarityLevel.IDENTICAL,
                    SimilarityLevel.NORMALIZED_DUP,
                    SimilarityLevel.NEAR_DUPLICATE,
                    SimilarityLevel.SIMILAR,
                    SimilarityLevel.RELATED
                ]

                print(f"\n{'='*70}")
                print(f"SIMILAR FILES REPORT")
                print(f"{'='*70}")
                print(f"Directory: {args.directory}")
                print(f"Pattern: {args.pattern}")
                print(f"Total pairs found: {len(results)}")
                print(f"{'='*70}\n")

                for level in level_order:
                    if level in by_level:
                        pairs = by_level[level]
                        print(f"[{level}] - {len(pairs)} pair(s)")
                        print(f"{'─'*70}")
                        for r in sorted(pairs, key=lambda x: -x['content_ratio']):
                            print(f" {r['content_ratio']}% | {r['file_a']}")
                            print(f" | {r['file_b']}")
                            if r['raw_hash_match']:
                                print(f" | Hash: MATCH (true duplicate)")
                            print()
                        print()
# Paste-mangled guard restored: the dunder underscores were stripped,
# leaving `if name == 'main':`, which would raise NameError at import.
if __name__ == '__main__':
    main()