#!/usr/bin/env python3
# scripts-fix-md040-code-fence-language
"""

title: "Fix Md040 Code Fence Language"
component_type: script
version: "1.0.0"
audience: contributor
status: stable
summary: "MD040 Code Fence Language Auto-Fixer (Non-Breaking)"
keywords: ['code', 'docker', 'fence', 'fix', 'git']
tokens: ~500
created: 2025-12-22
updated: 2025-12-22
script_name: "fix-md040-code-fence-language.py"
language: python
executable: true
usage: "python3 scripts/fix-md040-code-fence-language.py [options]"
python_version: "3.10+"
dependencies: []
modifies_files: false
network_access: false
requires_auth: false

MD040 Code Fence Language Auto-Fixer (Non-Breaking)

Detects code blocks without language specifiers and suggests appropriate languages based on content heuristics.

Safety Features:

- Dry-run mode (preview changes before applying)
- Automatic backups before modifications
- Git working tree validation
- Rollback capability
- Before/after validation reports

Usage:
    # DRY RUN: Preview changes (recommended first step)
    python3 scripts/fix-md040-code-fence-language.py --detect --dry-run

    # Detect and generate suggestions
    python3 scripts/fix-md040-code-fence-language.py --detect

    # Auto-apply high-confidence suggestions with backup
    python3 scripts/fix-md040-code-fence-language.py --auto-apply --min-confidence 90 --backup

    # Rollback changes
    python3 scripts/fix-md040-code-fence-language.py --rollback backups/md040-backup-TIMESTAMP

"""

import argparse
import json
import re
import shutil
import subprocess
import sys
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional

@dataclass
class CodeBlock:
    """Represents a code block without language specifier."""

    file_path: str  # path of the Markdown file, as a string
    line_number: int  # line of the opening fence (1-indexed)
    content: str  # excerpt of the block body
    suggested_language: str  # language tag proposed for the fence
    confidence: int  # 0-100
    reasoning: str  # short human-readable justification for the suggestion

class LanguageDetector:
    """Intelligent language detection based on code content.

    Each language maps to a list of ``(regex, confidence)`` pairs. A language's
    score is the highest confidence among its matching patterns; the language
    with the highest score wins, and ties go to the language listed first in
    the pattern table.
    """

    # Languages whose keywords/tags are matched case-insensitively. Every
    # other language is matched case-sensitively so that, for example, Python's
    # lowercase ``from x import y`` is not mistaken for a Dockerfile ``FROM``
    # instruction.
    CASE_INSENSITIVE_LANGUAGES = frozenset({'sql', 'html'})

    def __init__(self):
        """Build the per-language pattern table."""
        self.patterns = {
            'bash': [
                (r'^#!/bin/(ba)?sh', 95),
                (r'^\s*(export|source|alias)\s+', 85),
                (r'\$\{?[A-Z_]+\}?', 70),
                (r'(echo|cd|ls|mkdir|rm|mv|cp)\s+', 65),
            ],
            'python': [
                (r'^#!/usr/bin/(env\s+)?python', 95),
                (r'^\s*(import|from)\s+\w+', 90),
                (r'^\s*def\s+\w+\(', 85),
                (r'^\s*class\s+\w+[:(]', 85),
                (r'(print|range|len|str|int|list|dict)\(', 75),
            ],
            'javascript': [
                (r'^\s*(const|let|var)\s+\w+\s*=', 85),
                (r'^\s*function\s+\w+\(', 85),
                (r'^\s*(async|await)\s+', 80),
                (r'(console\.(log|error|warn)|require\(|module\.exports)', 80),
                (r'=>\s*{', 75),
            ],
            'typescript': [
                (r'^\s*interface\s+\w+\s*{', 90),
                (r'^\s*type\s+\w+\s*=', 85),
                (r':\s*(string|number|boolean|any|void)', 80),
                (r'(React\.|useState|useEffect)', 75),
            ],
            'json': [
                (r'^\s*\{[\s\n]*"[\w-]+":', 95),
                (r'^\s*\[[\s\n]*\{', 90),
                (r'"[\w-]+"\s*:\s*(".*?"|[\d.]+|true|false|null)', 85),
            ],
            'yaml': [
                (r'^\s*[\w-]+:\s*([\w-]+|"[^"]*")?\s*$', 85),
                (r'^\s*-\s+[\w-]+:', 80),
                (r'^\s*---\s*$', 90),
            ],
            'markdown': [
                (r'^#+\s+', 85),
                (r'\[.*?\]\(.*?\)', 75),
                (r'^>\s+', 70),
            ],
            'sql': [
                (r'^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP)\s+', 90),
                (r'^\s*(FROM|WHERE|JOIN|GROUP BY|ORDER BY)\s+', 85),
            ],
            'html': [
                (r'^\s*<(!DOCTYPE|html|head|body|div|span|p|a)', 90),
                (r'<\w+(\s+[\w-]+="[^"]*")*>', 80),
            ],
            'css': [
                (r'^\s*[\w.-]+\s*\{', 85),
                (r'^\s*[\w-]+\s*:\s*[^;]+;', 80),
            ],
            'rust': [
                (r'^\s*fn\s+\w+\(', 90),
                (r'^\s*(pub\s+)?(struct|enum|impl)\s+', 85),
                (r'(let|mut|&str|Vec<)', 75),
            ],
            'go': [
                (r'^\s*func\s+(\w+\s*)?\(', 90),
                (r'^\s*package\s+\w+', 95),
                (r'(import|var|const|type|interface)\s+', 80),
            ],
            'java': [
                (r'^\s*(public|private|protected)\s+(class|interface|enum)', 90),
                (r'^\s*(public|private|protected)\s+\w+\s+\w+\(', 85),
                (r'(System\.out\.|import\s+java\.)', 80),
            ],
            'c': [
                (r'^\s*#include\s+<[\w./]+>', 90),
                (r'^\s*(int|void|char|float|double)\s+\w+\(', 85),
            ],
            'cpp': [
                (r'^\s*#include\s+<[\w./]+>', 85),
                (r'(std::|using namespace|template<)', 90),
            ],
            'dockerfile': [
                (r'^\s*FROM\s+[\w/:.-]+', 95),
                (r'^\s*(RUN|CMD|ENTRYPOINT|COPY|ADD|WORKDIR|ENV)\s+', 90),
            ],
            'makefile': [
                (r'^\w+:\s*$', 85),
                (r'^\t', 75),
                (r'\$\([A-Z_]+\)', 70),
            ],
            'xml': [
                (r'^\s*<\?xml', 95),
                (r'<\w+(\s+[\w:]+="[^"]*")*>', 80),
            ],
            'toml': [
                (r'^\s*\[[\w.-]+\]', 90),
                (r'^\s*[\w-]+\s*=\s*', 80),
            ],
            'ini': [
                (r'^\s*\[[\w\s]+\]', 85),
                (r'^\s*[\w]+\s*=\s*', 75),
            ],
        }

    def detect(self, code: str) -> Tuple[str, int, str]:
        """
        Detect language from code content.

        Args:
            code: Body of the code block (fence lines excluded).

        Returns:
            (language, confidence, reasoning). When no pattern matches, the
            safe default ``('text', 50, ...)`` is returned.
        """
        scores: Dict[str, int] = {}
        reasons: Dict[str, str] = {}

        for lang, patterns in self.patterns.items():
            # ^ and $ must anchor at every line of the block, not just the first.
            flags = re.MULTILINE
            if lang in self.CASE_INSENSITIVE_LANGUAGES:
                flags |= re.IGNORECASE

            max_score = 0
            matching_patterns = []
            for pattern, confidence in patterns:
                if re.search(pattern, code, flags):
                    max_score = max(max_score, confidence)
                    matching_patterns.append(f"{pattern[:30]}...")

            if max_score > 0:
                scores[lang] = max_score
                reasons[lang] = f"Matched patterns: {', '.join(matching_patterns[:2])}"

        # No matches - use 'text' as safe default
        if not scores:
            return 'text', 50, 'No specific patterns detected, using generic text'

        # Highest score wins; max() keeps the first language on ties.
        best_lang = max(scores, key=scores.get)
        return best_lang, scores[best_lang], reasons[best_lang]

class SafetyManager:
    """Manages safety features: backups, git checks, rollbacks."""

    @staticmethod
    def _backup_relative_path(file_path: Path) -> Path:
        """Return *file_path* with its root/drive anchor removed.

        The result is the location of the file's copy inside a backup
        directory. Relative paths have an empty anchor and are returned
        unchanged.
        """
        return file_path.relative_to(file_path.anchor)

    @staticmethod
    def check_git_clean(repo_root: Path) -> bool:
        """Check if git working tree is clean.

        Returns:
            True only when ``git status --porcelain`` succeeds in *repo_root*
            and prints nothing. Any failure to run git counts as "not clean".
        """
        try:
            result = subprocess.run(
                ['git', 'status', '--porcelain'],
                cwd=repo_root,
                capture_output=True,
                text=True
            )
        except (OSError, subprocess.SubprocessError):
            # git missing or repo_root unusable: be conservative.
            return False
        return result.returncode == 0 and not result.stdout.strip()

    @staticmethod
    def create_backup(files: List[Path], backup_dir: Path) -> bool:
        """Create backup of files before modification.

        Args:
            files: Files to copy into the backup.
            backup_dir: Directory that receives the copies and a
                ``manifest.json`` listing the original paths.

        Returns:
            True on success; False (after printing the error) on failure.
        """
        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            for file_path in files:
                # Preserve directory structure in backup
                backup_file = backup_dir / SafetyManager._backup_relative_path(file_path)
                backup_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(file_path, backup_file)

            # Save manifest
            manifest = {
                'timestamp': datetime.now().isoformat(),
                'files': [str(f) for f in files]
            }
            (backup_dir / 'manifest.json').write_text(
                json.dumps(manifest, indent=2), encoding='utf-8'
            )

            return True
        except Exception as e:
            print(f"Backup failed: {e}")
            return False

    @staticmethod
    def rollback(backup_dir: Path) -> bool:
        """Restore files from backup.

        Reads ``manifest.json`` in *backup_dir* and copies each backed-up file
        over its original location.

        Returns:
            True when every file listed in the manifest was restored; False
            when the manifest is missing, a backup copy is missing, or an
            error occurs (the reason is printed).
        """
        try:
            manifest_file = backup_dir / 'manifest.json'
            if not manifest_file.exists():
                print(f"Error: No manifest found in {backup_dir}")
                return False

            manifest = json.loads(manifest_file.read_text(encoding='utf-8'))

            restored_all = True
            for file_str in manifest['files']:
                source = Path(file_str)
                backup_file = backup_dir / SafetyManager._backup_relative_path(source)

                if not backup_file.exists():
                    # Keep restoring the rest, but do not report success.
                    print(f"  ✗ Missing backup for {source}")
                    restored_all = False
                    continue

                shutil.copy2(backup_file, source)
                print(f"  ✓ Restored {source}")

            return restored_all
        except Exception as e:
            print(f"Rollback failed: {e}")
            return False

class MD040Fixer:
    """Fixes MD040 violations by adding language specifiers to code blocks."""

    # Directory-name prefix of the backups this tool creates; Markdown copies
    # inside them must never be scanned or rewritten.
    BACKUP_DIR_PREFIX = 'md040-backup-'

    def __init__(self, repo_root: Path, dry_run: bool = False):
        """
        Args:
            repo_root: Directory that is scanned; reported paths are relative to it.
            dry_run: When True, apply_fix only reports what it would change.
        """
        self.repo_root = repo_root
        self.dry_run = dry_run
        self.detector = LanguageDetector()
        self.code_block_pattern = re.compile(r'^```\s*$', re.MULTILINE)

    @staticmethod
    def _parse_opening_fence(line: str) -> Optional[Tuple[int, str]]:
        """Return ``(backtick_count, info_string)`` if *line* opens a fence, else None."""
        stripped = line.strip()
        if not stripped.startswith('```'):
            return None
        marker_len = len(stripped) - len(stripped.lstrip('`'))
        info = stripped[marker_len:].strip()
        if '`' in info:
            # Backticks after the marker mean an inline code span, not a fence.
            return None
        return marker_len, info

    @staticmethod
    def _is_closing_fence(line: str, marker_len: int) -> bool:
        """Return True if *line* is only backticks, at least *marker_len* of them."""
        stripped = line.strip()
        return len(stripped) >= marker_len and not stripped.strip('`')

    def find_violations(self, file_path: Path) -> List["CodeBlock"]:
        """Find all code blocks without language specifiers in a file.

        Fenced blocks are tracked from opening to closing fence, so the closing
        fence of a language-tagged block is never mistaken for a new, untagged
        block. Only openers that are exactly three backticks are reported,
        because that is the only form apply_fix rewrites.

        Returns:
            One CodeBlock per untagged fence, with the 1-indexed line number of
            the opening fence. An unreadable file yields an empty list (the
            error is printed).
        """
        violations: List["CodeBlock"] = []

        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f"Error reading {file_path}: {e}")
            return violations

        lines = content.split('\n')
        i = 0

        while i < len(lines):
            fence = self._parse_opening_fence(lines[i])
            if fence is None:
                i += 1
                continue

            marker_len, info = fence
            start_line = i + 1  # Line number (1-indexed) of the opening fence
            i += 1

            # Collect code block content up to the closing fence (or EOF)
            code_lines = []
            while i < len(lines) and not self._is_closing_fence(lines[i], marker_len):
                code_lines.append(lines[i])
                i += 1
            i += 1  # step past the closing fence

            if info or marker_len != 3:
                # Already tagged, or a longer fence that apply_fix does not rewrite.
                continue

            code_content = '\n'.join(code_lines)

            # Detect language
            lang, confidence, reasoning = self.detector.detect(code_content)

            violations.append(CodeBlock(
                file_path=str(file_path.relative_to(self.repo_root)),
                line_number=start_line,
                content=code_content[:200],  # First 200 chars
                suggested_language=lang,
                confidence=confidence,
                reasoning=reasoning
            ))

        return violations

    def scan_repository(self, pattern: str = "**/*.md") -> List["CodeBlock"]:
        """Scan repository for MD040 violations.

        Files are visited in sorted order so the report is deterministic.
        Anything under a directory whose name starts with ``md040-backup-`` is
        skipped, so a later run cannot rewrite the backups created earlier.
        """
        all_violations: List["CodeBlock"] = []

        for md_file in sorted(self.repo_root.glob(pattern)):
            if not md_file.is_file():
                continue
            rel_parts = md_file.relative_to(self.repo_root).parts
            if any(part.startswith(self.BACKUP_DIR_PREFIX) for part in rel_parts[:-1]):
                continue
            all_violations.extend(self.find_violations(md_file))

        return all_violations

    def apply_fix(self, file_path: Path, line_number: int, language: str) -> bool:
        """Apply fix to a specific code block.

        Args:
            file_path: Markdown file to modify.
            line_number: 1-indexed line of the opening fence, as reported by
                find_violations.
            language: Language tag to append to the fence.

        Returns:
            True if the line is a bare fence and was (or, in dry-run mode,
            would be) rewritten; False otherwise, including on I/O errors.
        """
        try:
            lines = file_path.read_text(encoding='utf-8').split('\n')
            idx = line_number - 1  # 1-indexed line number -> 0-indexed list position

            if not 0 <= idx < len(lines) or lines[idx].strip() != '```':
                return False

            if self.dry_run:
                print(f"  [DRY RUN] Would fix {file_path}:{line_number} → {language}")
                return True

            # Keep the fence's indentation so fences nested in lists stay nested.
            fence = lines[idx]
            indent = fence[:len(fence) - len(fence.lstrip())]
            lines[idx] = f'{indent}```{language}'

            # Write back
            file_path.write_text('\n'.join(lines), encoding='utf-8')
            return True
        except (OSError, UnicodeError) as e:
            if self.dry_run:
                print(f"  [DRY RUN] Cannot fix {file_path}:{line_number}: {e}")
            else:
                print(f"Error applying fix to {file_path}:{line_number}: {e}")
            return False

def _create_timestamped_backup(repo_root: Path, files: List[Path]) -> Optional[Path]:
    """Back up *files* under ``<repo_root>/backups/md040-backup-<timestamp>``.

    Returns the backup directory, or None if the backup failed.
    """
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    backup_dir = repo_root / 'backups' / f'md040-backup-{timestamp}'

    print(f"\nCreating backup of {len(files)} files...")
    if not SafetyManager.create_backup(files, backup_dir):
        print("❌ Backup failed - aborting")
        return None

    print(f"✅ Backup created: {backup_dir}")
    return backup_dir


def main() -> int:
    """Command-line entry point.

    Modes, in order of precedence:
      --rollback DIR        restore files from a backup directory
      --detect/--auto-apply scan the repository, write the suggestions file and
                            (with --auto-apply) apply suggestions whose
                            confidence is at least --min-confidence
      --apply FILE          apply every suggestion listed in a suggestions file
    With no mode the help text is printed.

    Returns:
        Process exit status: 0 on success, 1 when a rollback, a safety check,
        a backup or the suggestions-file lookup fails.
    """
    parser = argparse.ArgumentParser(
        description='Fix MD040 code fence language violations (Non-Breaking)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  # Dry run (safe preview)
  python3 %(prog)s --detect --dry-run

  # Detect and generate suggestions
  python3 %(prog)s --detect

  # Auto-apply high-confidence with backup
  python3 %(prog)s --auto-apply --min-confidence 90 --backup

  # Rollback changes
  python3 %(prog)s --rollback backups/md040-backup-20251206-143022
'''
    )

    parser.add_argument('--detect', action='store_true',
                        help='Detect violations and generate suggestions')
    parser.add_argument('--apply', type=str, metavar='FILE',
                        help='Apply fixes from suggestions file')
    parser.add_argument('--auto-apply', action='store_true',
                        help='Automatically apply high-confidence suggestions')
    parser.add_argument('--rollback', type=str, metavar='BACKUP_DIR',
                        help='Rollback changes from backup directory')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without modifying files')
    parser.add_argument('--backup', action='store_true',
                        help='Create backup before applying fixes')
    parser.add_argument('--require-clean-git', action='store_true',
                        help='Require clean git working tree before applying')
    parser.add_argument('--min-confidence', type=int, default=80,
                        help='Minimum confidence for auto-apply (default: 80)')
    parser.add_argument('--output', type=str, default='md040-suggestions.json',
                        help='Output file for suggestions (default: md040-suggestions.json)')
    parser.add_argument('--repo-root', type=str, default='.',
                        help='Repository root directory (default: current directory)')

    args = parser.parse_args()
    repo_root = Path(args.repo_root).resolve()

    # Handle rollback
    if args.rollback:
        backup_dir = Path(args.rollback)
        print(f"Rolling back changes from {backup_dir}...")

        if SafetyManager.rollback(backup_dir):
            print("\n✅ Rollback completed successfully")
            return 0

        print("\n❌ Rollback failed")
        return 1

    # Safety check: git working tree
    if args.require_clean_git and not SafetyManager.check_git_clean(repo_root):
        print("❌ Error: Git working tree is not clean")
        print("   Commit or stash changes first, or remove --require-clean-git flag")
        return 1

    fixer = MD040Fixer(repo_root, dry_run=args.dry_run)

    if args.detect or args.auto_apply:
        print("Scanning repository for MD040 violations...")
        violations = fixer.scan_repository()

        print(f"\nFound {len(violations)} code blocks without language specifiers")

        # Group by confidence
        high_confidence = [v for v in violations if v.confidence >= 90]
        medium_confidence = [v for v in violations if 70 <= v.confidence < 90]
        low_confidence = [v for v in violations if v.confidence < 70]

        print(f"  High confidence (≥90%): {len(high_confidence)}")
        print(f"  Medium confidence (70-89%): {len(medium_confidence)}")
        print(f"  Low confidence (<70%): {len(low_confidence)}")

        # Save suggestions
        output_file = repo_root / args.output
        with output_file.open('w', encoding='utf-8') as f:
            json.dump([asdict(v) for v in violations], f, indent=2)

        print(f"\nSuggestions saved to: {output_file}")

        if args.auto_apply:
            auto_apply_violations = [v for v in violations if v.confidence >= args.min_confidence]
            backup_dir = None

            if args.dry_run:
                print(f"\n[DRY RUN] Would apply {len(auto_apply_violations)} fixes with confidence ≥{args.min_confidence}%...")
            else:
                # Create backup if requested
                if args.backup:
                    affected_files = sorted({repo_root / v.file_path for v in auto_apply_violations})
                    backup_dir = _create_timestamped_backup(repo_root, affected_files)
                    if backup_dir is None:
                        return 1

                print(f"\nApplying {len(auto_apply_violations)} fixes with confidence ≥{args.min_confidence}%...")

            applied = 0
            for violation in auto_apply_violations:
                file_path = repo_root / violation.file_path
                if fixer.apply_fix(file_path, violation.line_number, violation.suggested_language):
                    applied += 1

            if args.dry_run:
                print(f"\n[DRY RUN] Would apply {applied}/{len(auto_apply_violations)} fixes")
            else:
                print(f"\n✅ Applied {applied}/{len(auto_apply_violations)} fixes successfully")

                if backup_dir is not None:
                    print(f"\n💡 To rollback: python3 {__file__} --rollback {backup_dir}")

    elif args.apply:
        suggestions_file = Path(args.apply)

        if not suggestions_file.exists():
            print(f"❌ Error: Suggestions file not found: {suggestions_file}")
            return 1

        with suggestions_file.open('r', encoding='utf-8') as f:
            suggestions = json.load(f)

        # Create backup if requested
        if args.backup and not args.dry_run:
            affected_files = sorted({repo_root / s['file_path'] for s in suggestions})
            if _create_timestamped_backup(repo_root, affected_files) is None:
                return 1

        print(f"\nApplying {len(suggestions)} fixes from {suggestions_file}...")

        applied = 0
        for suggestion in suggestions:
            file_path = repo_root / suggestion['file_path']
            if fixer.apply_fix(file_path, suggestion['line_number'], suggestion['suggested_language']):
                applied += 1

        if args.dry_run:
            print(f"\n[DRY RUN] Would apply {applied}/{len(suggestions)} fixes")
        else:
            print(f"\n✅ Applied {applied}/{len(suggestions)} fixes successfully")

    else:
        parser.print_help()

    return 0

# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status (None counts as success).
if __name__ == '__main__':
    sys.exit(main())

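# NOTE(review): the lines below look like residue from a document export
# (repeated shebang/docstring markers and the epilog headings); they are kept
# as comments so the module stays importable. Confirm and remove if unneeded.
#
#   #!/usr/bin/env python3 """
#   Dry run (safe preview)
#   Detect and generate suggestions
#   Auto-apply high-confidence with backup
#   Rollback changes
#   #!/usr/bin/env python3 """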