Skip to main content

scripts-smart-merge

#!/usr/bin/env python3
"""CODITECT Smart Merge - Intelligent document merging with LLM analysis.

Merges similar documents (ADRs, markdown, code) using structural analysis
and optional LLM-powered diff analysis for conflict resolution.

Similarity Classification:
    IDENTICAL      - Raw hash match (byte-for-byte identical)
    NORMALIZED_DUP - Same content, different whitespace
    NEAR_DUPLICATE - >95% content similarity (difflib)
    SIMILAR        - 70-95% content similarity
    RELATED        - 50-70% content similarity

Metadata:
    title: "Optional: Claude API for LLM analysis"
    component_type: script
    version: "1.0.0"
    audience: contributor
    status: stable
    summary: "CODITECT Smart Merge - Intelligent document merging with LLM analysis."
    keywords: ['analysis', 'api', 'merge', 'review', 'smart']
    tokens: ~500
    created: 2025-12-22
    updated: 2025-12-22
    script_name: "smart-merge.py"
    language: python
    executable: true
    usage: "python3 scripts/smart-merge.py [options]"
    python_version: "3.10+"
    dependencies: []
    modifies_files: false
    network_access: false
    requires_auth: false

Author: AZ1.AI INC (Hal Casteel)
Version: 2.0.0
Date: 2025-12-11
"""

import argparse
import hashlib
import json
import os
import re
import subprocess
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Claude API for LLM analysis (graceful degradation when absent)
try:
    import anthropic
    HAS_ANTHROPIC = True
except ImportError:
    HAS_ANTHROPIC = False

# Similarity classification thresholds
class SimilarityLevel:
    """Similarity classification labels, ordered from most to least similar."""
    IDENTICAL = "IDENTICAL"            # Raw hash match (byte-identical)
    NORMALIZED_DUP = "NORMALIZED_DUP"  # Normalized hash match
    NEAR_DUPLICATE = "NEAR_DUPLICATE"  # >95% content match
    SIMILAR = "SIMILAR"                # 70-95% content match
    RELATED = "RELATED"                # 50-70% content match
    DIFFERENT = "DIFFERENT"            # <50% content match


# Minimum difflib ratio required to qualify for each level.
SIMILARITY_THRESHOLDS = {
    SimilarityLevel.NEAR_DUPLICATE: 0.95,
    SimilarityLevel.SIMILAR: 0.70,
    SimilarityLevel.RELATED: 0.50,
}

@dataclass
class Section:
    """Represents a document section delimited by a markdown heading."""
    level: int       # Heading level (1-6)
    title: str       # Heading text (without the leading '#' marks)
    content: str     # Body text between this heading and the next
    line_start: int  # 0-based index of the heading line
    line_end: int    # 0-based index of the section's last line
    hash: str = ""   # Short fingerprint, filled in by __post_init__

    def __post_init__(self):
        # Fingerprint title+content so identical sections compare cheaply;
        # line positions deliberately do NOT affect the hash.
        self.hash = hashlib.sha256(
            f"{self.title}:{self.content}".encode()
        ).hexdigest()[:16]

@dataclass
class MergeConflict:
    """Represents a merge conflict between two versions of the same section."""
    section_title: str
    content_a: str
    content_b: str
    source_a: str  # Filename the A version came from
    source_b: str  # Filename the B version came from
    resolution: Optional[str] = None         # Chosen merged content, once resolved
    resolution_reason: Optional[str] = None  # Human-readable reason for the choice

@dataclass
class MergeResult:
    """Result of a merge operation."""
    success: bool
    merged_content: str
    conflicts: List[MergeConflict] = field(default_factory=list)
    stats: Dict[str, Any] = field(default_factory=dict)
    llm_analysis: Optional[str] = None  # JSON dump of per-section LLM reasoning, if any

@dataclass
class SimilarityResult:
    """Result of similarity analysis between two files."""
    file_a: Path
    file_b: Path
    level: str                   # SimilarityLevel classification
    raw_hash_match: bool         # Byte-identical
    normalized_hash_match: bool  # Content-identical (whitespace normalized)
    content_ratio: float         # difflib SequenceMatcher ratio (0.0-1.0)
    raw_hash_a: str
    raw_hash_b: str
    normalized_hash_a: str
    normalized_hash_b: str
    lines_a: int
    lines_b: int
    bytes_a: int
    bytes_b: int
    recommendation: str

class SmartMerger:
    """Intelligent document merger with LLM support."""

    # Matches markdown ATX headings: 1-6 '#' marks, whitespace, then the title.
    HEADING_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
    # Loose patterns for common document metadata fields (first match wins).
    METADATA_PATTERNS = {
        'version': re.compile(r'[Vv]ersion[:\s]+(\d+\.\d+\.?\d*)'),
        'date': re.compile(r'[Dd]ate[:\s]+(\d{4}-\d{2}-\d{2})'),
        'status': re.compile(r'[Ss]tatus[:\s]+(\w+)'),
        'author': re.compile(r'[Aa]uthor[:\s]+(.+)'),
    }

def __init__(self, use_llm: bool = False, model: str = "claude-sonnet-4-20250514"):
    """Set up the merger.

    LLM-assisted merging requires the ``anthropic`` package AND the
    ANTHROPIC_API_KEY environment variable; otherwise it is disabled
    with a warning and merges fall back to heuristics.
    """
    self.use_llm = use_llm and HAS_ANTHROPIC
    self.model = model
    self.client = None

    if self.use_llm:
        api_key = os.environ.get('ANTHROPIC_API_KEY')
        if api_key:
            self.client = anthropic.Anthropic(api_key=api_key)
        else:
            print("Warning: ANTHROPIC_API_KEY not set, LLM analysis disabled")
            self.use_llm = False

def raw_hash(self, content: str) -> str:
    """Generate SHA-256 hash of raw content (byte-identical check)."""
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

def normalized_hash(self, content: str) -> str:
    """Generate SHA-256 hash of normalized content (whitespace-insensitive).

    Normalization: strip, lowercase, then collapse every whitespace run
    (spaces, tabs, newlines) to a single space — so files that differ
    only in formatting or letter case hash identically.
    """
    normalized = re.sub(r'\s+', ' ', content.strip().lower())
    return hashlib.sha256(normalized.encode('utf-8')).hexdigest()

def content_similarity(self, content_a: str, content_b: str) -> float:
    """
    Calculate content similarity using difflib SequenceMatcher.
    Returns ratio from 0.0 (completely different) to 1.0 (identical).
    """
    return SequenceMatcher(None, content_a, content_b).ratio()

def classify_similarity(
    self,
    raw_match: bool,
    normalized_match: bool,
    content_ratio: float
) -> str:
    """Classify similarity level based on hash and content analysis.

    Precedence: hash matches trump the difflib ratio; otherwise the
    ratio is bucketed against SIMILARITY_THRESHOLDS, highest bucket wins.
    """
    if raw_match:
        return SimilarityLevel.IDENTICAL
    if normalized_match:
        return SimilarityLevel.NORMALIZED_DUP
    if content_ratio >= SIMILARITY_THRESHOLDS[SimilarityLevel.NEAR_DUPLICATE]:
        return SimilarityLevel.NEAR_DUPLICATE
    if content_ratio >= SIMILARITY_THRESHOLDS[SimilarityLevel.SIMILAR]:
        return SimilarityLevel.SIMILAR
    if content_ratio >= SIMILARITY_THRESHOLDS[SimilarityLevel.RELATED]:
        return SimilarityLevel.RELATED
    return SimilarityLevel.DIFFERENT

def get_recommendation(self, level: str, bytes_a: int, bytes_b: int) -> str:
    """Generate action recommendation based on similarity level.

    For NEAR_DUPLICATE/SIMILAR pairs, appends a hint to keep whichever
    file is more than 20% larger (assumed to carry more content).
    """
    recommendations = {
        SimilarityLevel.IDENTICAL: "DELETE_ONE: Files are byte-identical. Safe to delete either.",
        SimilarityLevel.NORMALIZED_DUP: "DELETE_ONE: Same content, only whitespace differs. Safe to delete either.",
        SimilarityLevel.NEAR_DUPLICATE: "REVIEW_DELETE: Very similar (>95%). Review briefly, then delete one.",
        SimilarityLevel.SIMILAR: "MERGE_RECOMMENDED: Significant overlap (70-95%). Smart merge recommended.",
        SimilarityLevel.RELATED: "REVIEW_NEEDED: Some overlap (50-70%). Manual review before action.",
        SimilarityLevel.DIFFERENT: "NO_ACTION: Files are different (<50% similar).",
    }

    rec = recommendations.get(level, "UNKNOWN")

    # Add size guidance for non-identical files
    if level in (SimilarityLevel.NEAR_DUPLICATE, SimilarityLevel.SIMILAR):
        if bytes_a > bytes_b * 1.2:
            rec += f" Keep A (larger by {bytes_a - bytes_b} bytes)."
        elif bytes_b > bytes_a * 1.2:
            rec += f" Keep B (larger by {bytes_b - bytes_a} bytes)."

    return rec

def analyze_similarity_detailed(self, file_a: Path, file_b: Path) -> SimilarityResult:
    """
    Perform detailed similarity analysis using multiple methods:
    1. Raw hash (byte-identical)
    2. Normalized hash (whitespace-insensitive)
    3. Content ratio (difflib SequenceMatcher)

    Reads both files as UTF-8; propagates any read/decode errors.
    """
    content_a = file_a.read_text(encoding='utf-8')
    content_b = file_b.read_text(encoding='utf-8')

    # Compute hashes
    raw_hash_a = self.raw_hash(content_a)
    raw_hash_b = self.raw_hash(content_b)
    norm_hash_a = self.normalized_hash(content_a)
    norm_hash_b = self.normalized_hash(content_b)

    raw_match = raw_hash_a == raw_hash_b
    normalized_match = norm_hash_a == norm_hash_b

    # Skip the expensive difflib pass when either hash already matches.
    if raw_match or normalized_match:
        content_ratio = 1.0
    else:
        content_ratio = self.content_similarity(content_a, content_b)

    # Classify and recommend
    level = self.classify_similarity(raw_match, normalized_match, content_ratio)
    recommendation = self.get_recommendation(
        level, len(content_a.encode('utf-8')), len(content_b.encode('utf-8'))
    )

    return SimilarityResult(
        file_a=file_a,
        file_b=file_b,
        level=level,
        raw_hash_match=raw_match,
        normalized_hash_match=normalized_match,
        content_ratio=content_ratio,
        raw_hash_a=raw_hash_a,
        raw_hash_b=raw_hash_b,
        normalized_hash_a=norm_hash_a,
        normalized_hash_b=norm_hash_b,
        lines_a=len(content_a.split('\n')),
        lines_b=len(content_b.split('\n')),
        bytes_a=len(content_a.encode('utf-8')),
        bytes_b=len(content_b.encode('utf-8')),
        recommendation=recommendation
    )

def extract_sections(self, content: str, source_name: str = "") -> List[Section]:
    """Extract sections from markdown content.

    Splits on ATX headings (HEADING_PATTERN). Any text before the first
    heading is not captured in a section. ``source_name`` is accepted for
    interface compatibility but is not used here.
    """
    sections = []
    lines = content.split('\n')

    current_section = None
    section_lines = []

    for i, line in enumerate(lines):
        match = self.HEADING_PATTERN.match(line)
        if match:
            # Close out the previous section before starting a new one.
            if current_section:
                current_section.content = '\n'.join(section_lines).strip()
                current_section.line_end = i - 1
                sections.append(current_section)

            # Start new section at this heading line.
            current_section = Section(
                level=len(match.group(1)),
                title=match.group(2).strip(),
                content="",
                line_start=i,
                line_end=i
            )
            section_lines = []
        else:
            section_lines.append(line)

    # Don't forget the last section
    if current_section:
        current_section.content = '\n'.join(section_lines).strip()
        current_section.line_end = len(lines) - 1
        sections.append(current_section)

    return sections

def extract_metadata(self, content: str) -> Dict[str, str]:
    """Extract document metadata (version, date, status, author).

    Only the first 2KB of the document is scanned, on the assumption
    that metadata lives near the top; first match per field wins.
    """
    metadata = {}
    for key, pattern in self.METADATA_PATTERNS.items():
        match = pattern.search(content[:2000])  # Check first 2KB
        if match:
            metadata[key] = match.group(1).strip()
    return metadata

def compare_sections(
    self,
    sections_a: List[Section],
    sections_b: List[Section]
) -> Dict[str, Any]:
    """Compare sections between two documents, keyed by section title.

    NOTE: duplicate titles within a single document collapse to the last
    occurrence, because sections are indexed by title here.
    """
    titles_a = {s.title: s for s in sections_a}
    titles_b = {s.title: s for s in sections_b}

    comparison = {
        'only_in_a': [],
        'only_in_b': [],
        'identical': [],
        'different': [],
    }

    for title in set(titles_a) | set(titles_b):
        in_a = title in titles_a
        in_b = title in titles_b

        if in_a and not in_b:
            comparison['only_in_a'].append(titles_a[title])
        elif in_b and not in_a:
            comparison['only_in_b'].append(titles_b[title])
        else:
            # Present in both: content-hash equality decides identical vs conflict.
            sec_a = titles_a[title]
            sec_b = titles_b[title]

            if sec_a.hash == sec_b.hash:
                comparison['identical'].append((sec_a, sec_b))
            else:
                comparison['different'].append((sec_a, sec_b))

    return comparison

def analyze_with_llm(
    self,
    content_a: str,
    content_b: str,
    section_title: str,
    source_a: str,
    source_b: str
) -> Tuple[str, str]:
    """Use LLM to analyze and merge conflicting sections.

    Returns (merged_content, reason). Falls back to version A when no
    client is configured, the API call fails, or the response contains
    no parseable JSON. Each version is truncated to 3000 chars in the
    prompt to bound token usage.
    """
    if not self.client:
        return content_a, "LLM not available, kept version A"

    prompt = f"""You are analyzing two versions of a document section that need to be merged.

SECTION TITLE: {section_title}

VERSION A (from {source_a}):

{content_a[:3000]}

VERSION B (from {source_b}):

{content_b[:3000]}

TASK: Analyze these versions and produce a merged result that:

1. Preserves ALL unique information from both versions
2. Uses the more detailed/accurate version when they conflict
3. Maintains consistent formatting
4. Keeps the most recent metadata (dates, versions)

Respond with JSON: {{ "merged_content": "The merged section content...", "reasoning": "Brief explanation of merge decisions", "confidence": 0.95 }}"""

    try:
        response = self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
        )

        # Parse JSON response; the model may wrap it in a markdown code block,
        # so grab the outermost {...} span instead of parsing the whole text.
        response_text = response.content[0].text
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            result = json.loads(json_match.group())
            return result.get('merged_content', content_a), result.get('reasoning', 'LLM merge')

    except Exception as e:
        print(f"LLM analysis error: {e}")

    return content_a, "LLM analysis failed, kept version A"

def merge_metadata_smart(
    self,
    meta_a: Dict[str, str],
    meta_b: Dict[str, str]
) -> Dict[str, str]:
    """Smartly merge document metadata.

    Rules: version -> numerically highest; date -> most recent (ISO string
    compare); status -> highest-priority lifecycle stage; author -> union
    of both, comma-joined and sorted.
    """
    merged = {}

    # Version: use highest, comparing numerically (so 1.10 > 1.2).
    if 'version' in meta_a or 'version' in meta_b:
        v_a = meta_a.get('version', '0.0.0')
        v_b = meta_b.get('version', '0.0.0')
        try:
            # Pad with zeros so short versions ("1.2") compare cleanly.
            merged['version'] = max(
                v_a, v_b,
                key=lambda v: list(map(int, v.split('.')[:3] + ['0', '0', '0']))
            )
        except ValueError:
            # Non-numeric component: fall back to whichever is present.
            merged['version'] = v_a or v_b

    # Date: use most recent (ISO dates sort lexicographically).
    if 'date' in meta_a or 'date' in meta_b:
        d_a = meta_a.get('date', '1970-01-01')
        d_b = meta_b.get('date', '1970-01-01')
        merged['date'] = max(d_a, d_b)

    # Status: prefer "Implemented" > "Accepted" > "Draft" > "Proposed"
    status_priority = {'implemented': 4, 'accepted': 3, 'draft': 2, 'proposed': 1}
    if 'status' in meta_a or 'status' in meta_b:
        s_a = meta_a.get('status', 'draft').lower()
        s_b = meta_b.get('status', 'draft').lower()
        if status_priority.get(s_a, 0) >= status_priority.get(s_b, 0):
            merged['status'] = meta_a.get('status', meta_b.get('status', 'Draft'))
        else:
            merged['status'] = meta_b.get('status', 'Draft')

    # Author: merge unique authors
    if 'author' in meta_a or 'author' in meta_b:
        authors = set()
        if 'author' in meta_a:
            authors.add(meta_a['author'])
        if 'author' in meta_b:
            authors.add(meta_b['author'])
        merged['author'] = ', '.join(sorted(authors))

    return merged

def merge_documents(
    self,
    file_a: Path,
    file_b: Path,
    output_file: Optional[Path] = None,
    strategy: str = "smart"  # "smart", "prefer_a", "prefer_b", "longer"
) -> MergeResult:
    """Merge two documents intelligently.

    Duplicate files (raw or normalized hash match) short-circuit without
    merging. Otherwise sections are extracted, compared by title, and
    conflicts resolved per ``strategy``; a new document is assembled with
    merged metadata frontmatter and a merge footer.

    NOTE: the merged document orders sections by (level, title), so the
    original document order is only approximated.
    """
    content_a = file_a.read_text(encoding='utf-8')
    content_b = file_b.read_text(encoding='utf-8')

    # Quick check: identical files (use raw hash)
    if self.raw_hash(content_a) == self.raw_hash(content_b):
        return MergeResult(
            success=True,
            merged_content=content_a,
            stats={
                'status': 'IDENTICAL',
                'action': 'no_merge_needed',
                'message': 'Files are byte-identical'
            }
        )

    # Check normalized hash
    if self.normalized_hash(content_a) == self.normalized_hash(content_b):
        return MergeResult(
            success=True,
            merged_content=content_a,
            stats={
                'status': 'NORMALIZED_DUP',
                'action': 'no_merge_needed',
                'message': 'Files have identical content (whitespace differs)'
            }
        )

    # Extract structure
    sections_a = self.extract_sections(content_a, file_a.name)
    sections_b = self.extract_sections(content_b, file_b.name)
    meta_a = self.extract_metadata(content_a)
    meta_b = self.extract_metadata(content_b)

    # Compare sections
    comparison = self.compare_sections(sections_a, sections_b)

    # Build merged document
    merged_sections = []
    conflicts = []
    llm_analyses = []

    # 1. Sections only in A are kept verbatim
    for section in comparison['only_in_a']:
        merged_sections.append((section.level, section.title, section.content))

    # 2. Sections only in B are kept verbatim
    for section in comparison['only_in_b']:
        merged_sections.append((section.level, section.title, section.content))

    # 3. Identical sections: keep one copy
    for sec_a, sec_b in comparison['identical']:
        merged_sections.append((sec_a.level, sec_a.title, sec_a.content))

    # 4. Differing sections: resolve the conflict per strategy
    for sec_a, sec_b in comparison['different']:
        conflict = MergeConflict(
            section_title=sec_a.title,
            content_a=sec_a.content,
            content_b=sec_b.content,
            source_a=file_a.name,
            source_b=file_b.name
        )

        if strategy == "prefer_a":
            merged_content = sec_a.content
            reason = "Strategy: prefer_a"
        elif strategy == "prefer_b":
            merged_content = sec_b.content
            reason = "Strategy: prefer_b"
        elif strategy == "longer":
            reason = f"Strategy: longer (A={len(sec_a.content)}, B={len(sec_b.content)})"
            if len(sec_a.content) >= len(sec_b.content):
                merged_content = sec_a.content
            else:
                merged_content = sec_b.content
        elif strategy == "smart" and self.use_llm:
            merged_content, reason = self.analyze_with_llm(
                sec_a.content, sec_b.content,
                sec_a.title, file_a.name, file_b.name
            )
            llm_analyses.append({
                'section': sec_a.title,
                'reason': reason
            })
        else:
            # Default (smart without LLM): prefer longer version
            if len(sec_a.content) >= len(sec_b.content):
                merged_content = sec_a.content
                reason = "Default: kept longer version (A)"
            else:
                merged_content = sec_b.content
                reason = "Default: kept longer version (B)"

        conflict.resolution = merged_content
        conflict.resolution_reason = reason
        conflicts.append(conflict)

        merged_sections.append((sec_a.level, sec_a.title, merged_content))

    # Sort sections by original order (approximation based on level):
    # top-level sections first, then subsections, alphabetical within level.
    merged_sections.sort(key=lambda x: (x[0], x[1]))

    # Rebuild document
    merged_meta = self.merge_metadata_smart(meta_a, meta_b)

    lines = []

    # Add frontmatter header if we have metadata
    if merged_meta:
        lines.append("---")
        for key, value in merged_meta.items():
            lines.append(f"{key.title()}: {value}")
        lines.append("---")
        lines.append("")

    # Add sections
    for level, title, content in merged_sections:
        lines.append(f"{'#' * level} {title}")
        lines.append("")
        lines.append(content)
        lines.append("")

    # Add merge footer
    lines.append("---")
    lines.append("")
    lines.append(f"*Merged by CODITECT Smart Merge on {datetime.now(timezone.utc).isoformat()}*")
    lines.append(f"*Sources: {file_a.name}, {file_b.name}*")

    merged_content = '\n'.join(lines)

    # Write output if specified
    if output_file:
        output_file.write_text(merged_content, encoding='utf-8')

    return MergeResult(
        success=True,
        merged_content=merged_content,
        conflicts=conflicts,
        stats={
            'sections_a': len(sections_a),
            'sections_b': len(sections_b),
            'only_in_a': len(comparison['only_in_a']),
            'only_in_b': len(comparison['only_in_b']),
            'identical': len(comparison['identical']),
            'conflicts_resolved': len(conflicts),
            'strategy': strategy,
            'llm_used': self.use_llm and strategy == "smart"
        },
        llm_analysis=json.dumps(llm_analyses, indent=2) if llm_analyses else None
    )

# Legacy method for backwards compatibility
def analyze_similarity(self, file_a: Path, file_b: Path) -> Dict[str, Any]:
    """Analyze similarity between two documents (legacy interface).

    Returns a flat, JSON-friendly dict: content_ratio is a percentage
    (0-100, one decimal) and hashes are truncated for display.
    """
    result = self.analyze_similarity_detailed(file_a, file_b)

    return {
        'file_a': str(result.file_a),
        'file_b': str(result.file_b),
        'level': result.level,
        'raw_hash_match': result.raw_hash_match,
        'normalized_hash_match': result.normalized_hash_match,
        'content_ratio': round(result.content_ratio * 100, 1),
        'raw_hash_a': result.raw_hash_a[:12] + '...',
        'raw_hash_b': result.raw_hash_b[:12] + '...',
        'lines_a': result.lines_a,
        'lines_b': result.lines_b,
        'bytes_a': result.bytes_a,
        'bytes_b': result.bytes_b,
        'recommendation': result.recommendation
    }

def find_similar_files(
    directory: Path,
    pattern: str = "*.md",
    threshold: float = 0.5,
    duplicates_only: bool = False
) -> List[Dict[str, Any]]:
    """
    Find potentially similar files in a directory.

    Only files sharing the same basename are compared (pairwise within
    each name group) — files with different names are never checked.

    Args:
        directory: Directory to scan (recursively)
        pattern: Glob pattern for files
        threshold: Minimum similarity ratio (0.0-1.0)
        duplicates_only: If True, only return IDENTICAL and NORMALIZED_DUP

    Returns:
        List of similarity results with classification
    """
    files = list(directory.rglob(pattern))
    results = []
    merger = SmartMerger()

    # Group by filename (without path)
    by_name = {}
    for f in files:
        by_name.setdefault(f.name, []).append(f)

    # Compare every pair within each same-name group
    for name, paths in by_name.items():
        if len(paths) < 2:
            continue
        for i, path_a in enumerate(paths):
            for path_b in paths[i + 1:]:
                try:
                    result = merger.analyze_similarity_detailed(path_a, path_b)

                    # Filter based on mode
                    if duplicates_only:
                        if result.level not in (SimilarityLevel.IDENTICAL,
                                                SimilarityLevel.NORMALIZED_DUP):
                            continue
                    elif result.content_ratio < threshold:
                        continue

                    results.append({
                        'file_a': str(path_a),
                        'file_b': str(path_b),
                        'level': result.level,
                        'content_ratio': round(result.content_ratio * 100, 1),
                        'raw_hash_match': result.raw_hash_match,
                        'normalized_hash_match': result.normalized_hash_match,
                        'bytes_a': result.bytes_a,
                        'bytes_b': result.bytes_b,
                        'recommendation': result.recommendation
                    })
                except Exception as e:
                    # Best-effort scan: report and keep going on unreadable files.
                    print(f"Error comparing {path_a} and {path_b}: {e}", file=sys.stderr)

    return results

def main():
    """CLI entry point: `analyze`, `merge`, or `find` sub-commands."""
    parser = argparse.ArgumentParser(
        description="CODITECT Smart Merge v2.0 - Intelligent document merging with proper similarity classification"
    )

    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Analyze command
    analyze_parser = subparsers.add_parser('analyze', help='Analyze similarity between two files')
    analyze_parser.add_argument('file_a', type=Path, help='First file')
    analyze_parser.add_argument('file_b', type=Path, help='Second file')
    analyze_parser.add_argument('--json', action='store_true', help='Output as JSON')

    # Merge command
    merge_parser = subparsers.add_parser('merge', help='Merge two files')
    merge_parser.add_argument('file_a', type=Path, help='First file')
    merge_parser.add_argument('file_b', type=Path, help='Second file')
    merge_parser.add_argument('-o', '--output', type=Path, help='Output file')
    merge_parser.add_argument(
        '--strategy',
        choices=['smart', 'prefer_a', 'prefer_b', 'longer'],
        default='smart',
        help='Merge strategy'
    )
    merge_parser.add_argument('--llm', action='store_true', help='Use LLM for conflict resolution')
    merge_parser.add_argument('--dry-run', action='store_true', help='Show what would be merged')
    merge_parser.add_argument('--json', action='store_true', help='Output stats as JSON')

    # Find command
    find_parser = subparsers.add_parser('find', help='Find similar files in directory')
    find_parser.add_argument('directory', type=Path, help='Directory to scan')
    find_parser.add_argument('--pattern', default='*.md', help='File pattern (default: *.md)')
    find_parser.add_argument('--threshold', type=float, default=0.5, help='Similarity threshold 0-1 (default: 0.5)')
    find_parser.add_argument('--duplicates-only', action='store_true',
                             help='Only show true duplicates (IDENTICAL or NORMALIZED_DUP)')
    find_parser.add_argument('--json', action='store_true', help='Output as JSON')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    if args.command == 'analyze':
        merger = SmartMerger()
        result = merger.analyze_similarity(args.file_a, args.file_b)

        if args.json:
            print(json.dumps(result, indent=2))
        else:
            # ANSI color coding: red = action needed, green = no action
            level_colors = {
                SimilarityLevel.IDENTICAL: '\033[91m',       # Red (action needed)
                SimilarityLevel.NORMALIZED_DUP: '\033[91m',  # Red
                SimilarityLevel.NEAR_DUPLICATE: '\033[93m',  # Yellow
                SimilarityLevel.SIMILAR: '\033[93m',         # Yellow
                SimilarityLevel.RELATED: '\033[94m',         # Blue
                SimilarityLevel.DIFFERENT: '\033[92m',       # Green (no action)
            }
            reset = '\033[0m'
            color = level_colors.get(result['level'], '')

            print(f"\n{'='*70}")
            print(f"SMART MERGE ANALYSIS v2.0")
            print(f"{'='*70}")
            print(f"File A: {result['file_a']}")
            print(f"File B: {result['file_b']}")
            print(f"{'─'*70}")
            print(f"Bytes: A={result['bytes_a']:,}, B={result['bytes_b']:,}")
            print(f"Lines: A={result['lines_a']}, B={result['lines_b']}")
            print(f"{'─'*70}")
            print(f"Raw Hash Match: {result['raw_hash_match']}")
            print(f"Normalized Hash Match: {result['normalized_hash_match']}")
            print(f"Content Similarity: {result['content_ratio']}%")
            print(f"{'─'*70}")
            print(f"Classification: {color}{result['level']}{reset}")
            print(f"{'─'*70}")
            print(f"Recommendation: {result['recommendation']}")
            print(f"{'='*70}\n")

    elif args.command == 'merge':
        merger = SmartMerger(use_llm=args.llm)

        if args.dry_run:
            # Analyze only; never touch the output file.
            result = merger.analyze_similarity(args.file_a, args.file_b)
            print(f"DRY RUN - Would merge:")
            print(f" {args.file_a}")
            print(f" {args.file_b}")
            print(f" Strategy: {args.strategy}")
            print(f" LLM: {args.llm}")
            print(f" Output: {args.output or 'stdout'}")
            print(f" Level: {result['level']}")
            print(f" Recommendation: {result['recommendation']}")
            sys.exit(0)

        result = merger.merge_documents(
            args.file_a,
            args.file_b,
            args.output,
            args.strategy
        )

        if args.json:
            print(json.dumps({
                'success': result.success,
                'stats': result.stats,
                'conflicts': len(result.conflicts),
                'output': str(args.output) if args.output else None
            }, indent=2))
        else:
            if args.output:
                print(f"Merged document written to: {args.output}")
            else:
                print(result.merged_content)

            print(f"\nMerge Stats:")
            for key, value in result.stats.items():
                print(f" {key}: {value}")

            if result.conflicts:
                print(f"\nResolved {len(result.conflicts)} conflicts:")
                for c in result.conflicts:
                    print(f" - {c.section_title}: {c.resolution_reason}")

    elif args.command == 'find':
        results = find_similar_files(
            args.directory,
            args.pattern,
            args.threshold,
            args.duplicates_only
        )

        if args.json:
            print(json.dumps(results, indent=2))
        else:
            if not results:
                if args.duplicates_only:
                    print(f"No true duplicates found (IDENTICAL or NORMALIZED_DUP)")
                else:
                    print(f"No similar files found (threshold: {args.threshold*100}%)")
            else:
                # Group by classification level
                by_level = {}
                for r in results:
                    by_level.setdefault(r['level'], []).append(r)

                # Print in order of severity (most actionable first)
                level_order = [
                    SimilarityLevel.IDENTICAL,
                    SimilarityLevel.NORMALIZED_DUP,
                    SimilarityLevel.NEAR_DUPLICATE,
                    SimilarityLevel.SIMILAR,
                    SimilarityLevel.RELATED
                ]

                print(f"\n{'='*70}")
                print(f"SIMILAR FILES REPORT")
                print(f"{'='*70}")
                print(f"Directory: {args.directory}")
                print(f"Pattern: {args.pattern}")
                print(f"Total pairs found: {len(results)}")
                print(f"{'='*70}\n")

                for level in level_order:
                    if level in by_level:
                        pairs = by_level[level]
                        print(f"[{level}] - {len(pairs)} pair(s)")
                        print(f"{'─'*70}")
                        for r in sorted(pairs, key=lambda x: -x['content_ratio']):
                            print(f" {r['content_ratio']}% | {r['file_a']}")
                            print(f" | {r['file_b']}")
                            if r['raw_hash_match']:
                                print(f" | Hash: MATCH (true duplicate)")
                            print()
                        print()
# Paste-mangled guard restored: the dunder underscores were stripped,
# leaving `if name == 'main':`, which would raise NameError at import.
if __name__ == '__main__':
    main()