# scripts/codebase_analyzer.py

#!/usr/bin/env python3
"""CODITECT Codebase Analyzer.

C3.x suite for comprehensive codebase analysis:

  • C3.1: Design pattern detection (via PatternDetector)
  • C3.2: Test example extraction
  • C3.3: How-to guide generation
  • C3.4: Configuration pattern extraction
  • C3.5: Architecture overview generation
  • C3.7: Architectural pattern detection

Author: CODITECT
Version: 1.0.0
"""

import ast
import json
import re
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Optional

from .pattern_detector import PatternDetector, PatternReport

class ArchitecturePattern(Enum):
    """High-level architectural patterns a codebase may exhibit."""

    MVC = "mvc"
    MVVM = "mvvm"
    MICROSERVICES = "microservices"
    MONOLITH = "monolith"
    EVENT_DRIVEN = "event_driven"
    LAYERED = "layered"
    HEXAGONAL = "hexagonal"
    PLUGIN = "plugin"
    COMPONENT_BASED = "component_based"

@dataclass
class TestExample:
    """A single test case extracted from a test file."""

    name: str
    file_path: str
    code: str
    description: str
    # One of: "unit", "integration", "e2e".
    test_type: str
    setup_code: Optional[str] = None
    assertions: list[str] = field(default_factory=list)

@dataclass
class HowToGuide:
    """A generated how-to guide for a codebase topic."""

    title: str
    description: str
    steps: list[str]
    code_examples: list[str]
    prerequisites: list[str] = field(default_factory=list)
    related_files: list[str] = field(default_factory=list)

@dataclass
class ConfigPattern:
    """A detected configuration file and the keys extracted from it."""

    name: str
    file_path: str
    # One of: "env", "json", "yaml", "toml", "ini".
    config_type: str
    keys: list[str]
    example_values: dict[str, Any]
    description: str = ""

@dataclass
class ArchitectureOverview:
    """Summary of the codebase's inferred architecture."""

    primary_pattern: ArchitecturePattern
    secondary_patterns: list[ArchitecturePattern]
    layers: list[str]
    entry_points: list[str]
    # (source, target, description) edges describing data flow.
    data_flow: list[tuple[str, str, str]]
    dependencies: dict[str, list[str]]
    confidence: float = 0.0

@dataclass
class CodebaseReport:
    """Complete codebase analysis report produced by CodebaseAnalyzer."""

    root_path: str
    # Mapping of language name -> number of files in that language.
    languages: dict[str, int]
    total_files: int
    total_lines: int
    pattern_reports: list[PatternReport]
    test_examples: list[TestExample]
    how_to_guides: list[HowToGuide]
    config_patterns: list[ConfigPattern]
    architecture: Optional[ArchitectureOverview] = None

    def to_dict(self) -> dict:
        """Return a JSON-serializable summary (counts, not full contents)."""
        return {
            "root_path": self.root_path,
            "languages": self.languages,
            "total_files": self.total_files,
            "total_lines": self.total_lines,
            "patterns_found": sum(len(r.patterns) for r in self.pattern_reports),
            "test_examples": len(self.test_examples),
            "how_to_guides": len(self.how_to_guides),
            "config_patterns": len(self.config_patterns),
            "architecture": self.architecture.primary_pattern.value if self.architecture else None
        }

class CodebaseAnalyzer:
    """CODITECT Codebase Analyzer.

    Implements C3.x analysis suite:
    - C3.1: Pattern detection (delegates to PatternDetector)
    - C3.2: Test extraction with example parsing
    - C3.3: How-to guide generation from docstrings/comments
    - C3.4: Configuration pattern detection
    - C3.5: Architecture overview with data flow
    - C3.7: Architectural pattern detection

    Improvements over basic analyzers:
    - Multi-language support (9 languages)
    - Context-aware extraction (understands test frameworks)
    - Intelligent guide generation from code patterns
    - Dependency graph analysis
    """

    # File extensions recognized for each supported language.
    LANGUAGE_EXTENSIONS = {
        "python": [".py"],
        "javascript": [".js", ".mjs"],
        "typescript": [".ts", ".tsx"],
        "java": [".java"],
        "go": [".go"],
        "rust": [".rs"],
        "cpp": [".cpp", ".cc", ".cxx", ".hpp", ".h"],
        "csharp": [".cs"],
        "ruby": [".rb"]
    }

    # Per-language heuristics for recognizing test files, test functions,
    # and assertion statements.
    TEST_PATTERNS = {
        "python": {
            "frameworks": ["pytest", "unittest", "nose"],
            "file_patterns": [r"test_.*\.py$", r".*_test\.py$", r"tests\.py$"],
            "function_patterns": [r"^test_", r"^Test"],
            "assertion_patterns": [r"assert\s+", r"self\.assert", r"pytest\.raises"]
        },
        "javascript": {
            "frameworks": ["jest", "mocha", "jasmine", "vitest"],
            "file_patterns": [r".*\.test\.(js|ts)$", r".*\.spec\.(js|ts)$"],
            "function_patterns": [r"^it\(", r"^test\(", r"^describe\("],
            "assertion_patterns": [r"expect\(", r"assert\.", r"should\."]
        },
        "go": {
            "frameworks": ["testing"],
            "file_patterns": [r".*_test\.go$"],
            "function_patterns": [r"^func\s+Test"],
            "assertion_patterns": [r"t\.Error", r"t\.Fatal", r"assert\."]
        },
        "rust": {
            "frameworks": ["cargo test"],
            "file_patterns": [r".*_test\.rs$", r"tests/.*\.rs$"],
            "function_patterns": [r"#\[test\]", r"#\[tokio::test\]"],
            "assertion_patterns": [r"assert!", r"assert_eq!", r"assert_ne!"]
        }
    }

    # Filename regexes that identify configuration files, keyed by format.
    CONFIG_PATTERNS = {
        "env": [r"\.env.*$", r"\.env$"],
        "json": [r".*config.*\.json$", r"package\.json$", r"tsconfig\.json$"],
        "yaml": [r".*\.ya?ml$", r".*config.*\.ya?ml$"],
        "toml": [r".*\.toml$", r"Cargo\.toml$", r"pyproject\.toml$"],
        "ini": [r".*\.ini$", r"setup\.cfg$", r"tox\.ini$"]
    }

    # Directory names, file names, and content regexes that hint at each
    # architectural pattern during scoring.
    ARCHITECTURE_INDICATORS = {
        ArchitecturePattern.MVC: {
            "dirs": ["models", "views", "controllers"],
            "files": ["controller.py", "view.py", "model.py"],
            "patterns": [r"class\s+\w+Controller", r"class\s+\w+View"]
        },
        ArchitecturePattern.MICROSERVICES: {
            "dirs": ["services", "api", "gateway"],
            "files": ["docker-compose.yml", "kubernetes.yaml"],
            "patterns": [r"@Service", r"@RestController", r"@app\.route"]
        },
        ArchitecturePattern.EVENT_DRIVEN: {
            "dirs": ["events", "handlers", "listeners", "subscribers"],
            "files": [],
            "patterns": [r"@EventHandler", r"on_event", r"emit\(", r"subscribe\("]
        },
        ArchitecturePattern.LAYERED: {
            "dirs": ["domain", "application", "infrastructure", "presentation"],
            "files": [],
            "patterns": [r"class\s+\w+Repository", r"class\s+\w+Service"]
        },
        ArchitecturePattern.HEXAGONAL: {
            "dirs": ["ports", "adapters", "core", "domain"],
            "files": [],
            "patterns": [r"interface\s+\w+Port", r"class\s+\w+Adapter"]
        },
        ArchitecturePattern.COMPONENT_BASED: {
            "dirs": ["components", "modules", "features"],
            "files": [],
            "patterns": [r"@Component", r"export\s+default\s+function\s+\w+"]
        }
    }

def __init__(self, root_path: Path):
self.root_path = Path(root_path)
self.pattern_detector = PatternDetector()

def analyze(
self,
skip_tests: bool = False,
skip_guides: bool = False,
skip_configs: bool = False,
skip_architecture: bool = False,
max_files: int = 1000
) -> CodebaseReport:
"""
Perform comprehensive codebase analysis.

Args:
skip_tests: Skip C3.2 test extraction
skip_guides: Skip C3.3 guide generation
skip_configs: Skip C3.4 config detection
skip_architecture: Skip C3.5/C3.7 architecture analysis
max_files: Maximum files to analyze

Returns:
CodebaseReport with all analysis results
"""
# Collect files
all_files = self._collect_files(max_files)
languages = self._detect_languages(all_files)
total_lines = self._count_lines(all_files)

# C3.1: Pattern detection
pattern_reports = self.pattern_detector.detect_in_directory(self.root_path)

# C3.2: Test extraction
test_examples = []
if not skip_tests:
test_examples = self._extract_tests(all_files)

# C3.3: How-to guides
how_to_guides = []
if not skip_guides:
how_to_guides = self._generate_guides(all_files)

# C3.4: Config patterns
config_patterns = []
if not skip_configs:
config_patterns = self._detect_configs(all_files)

# C3.5 + C3.7: Architecture
architecture = None
if not skip_architecture:
architecture = self._analyze_architecture(all_files)

return CodebaseReport(
root_path=str(self.root_path),
languages=languages,
total_files=len(all_files),
total_lines=total_lines,
pattern_reports=pattern_reports,
test_examples=test_examples,
how_to_guides=how_to_guides,
config_patterns=config_patterns,
architecture=architecture
)

def _collect_files(self, max_files: int) -> list[Path]:
"""Collect all source files."""
files = []
extensions = set()
for exts in self.LANGUAGE_EXTENSIONS.values():
extensions.update(exts)

for ext in extensions:
for file_path in self.root_path.rglob(f"*{ext}"):
# Skip hidden, vendor, node_modules
str_path = str(file_path)
if any(skip in str_path for skip in [
"/.", "node_modules", "vendor", "__pycache__",
"venv", ".venv", "dist", "build"
]):
continue
files.append(file_path)
if len(files) >= max_files:
return files

return files

def _detect_languages(self, files: list[Path]) -> dict[str, int]:
"""Detect languages by file extension."""
counts = {}
for file_path in files:
ext = file_path.suffix.lower()
for lang, exts in self.LANGUAGE_EXTENSIONS.items():
if ext in exts:
counts[lang] = counts.get(lang, 0) + 1
break
return counts

def _count_lines(self, files: list[Path]) -> int:
"""Count total lines of code."""
total = 0
for file_path in files:
try:
total += len(file_path.read_text().split('\n'))
except Exception:
pass
return total

def _extract_tests(self, files: list[Path]) -> list[TestExample]:
"""C3.2: Extract test examples from test files."""
examples = []

for file_path in files:
# Determine language
lang = self._file_language(file_path)
if not lang or lang not in self.TEST_PATTERNS:
continue

test_config = self.TEST_PATTERNS[lang]

# Check if this is a test file
is_test = any(
re.search(pattern, str(file_path))
for pattern in test_config["file_patterns"]
)
if not is_test:
continue

try:
content = file_path.read_text()
except Exception:
continue

# Extract tests based on language
if lang == "python":
examples.extend(self._extract_python_tests(file_path, content))
elif lang in ["javascript", "typescript"]:
examples.extend(self._extract_js_tests(file_path, content))
elif lang == "go":
examples.extend(self._extract_go_tests(file_path, content))

# Limit examples
if len(examples) >= 50:
break

return examples

def _extract_python_tests(self, file_path: Path, content: str) -> list[TestExample]:
"""Extract Python test examples."""
examples = []

try:
tree = ast.parse(content)
except SyntaxError:
return examples

for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name.startswith("test_"):
# Get function source
lines = content.split('\n')
start = node.lineno - 1
end = node.end_lineno if hasattr(node, 'end_lineno') else start + 20

code = '\n'.join(lines[start:end])

# Extract docstring
description = ""
if (node.body and isinstance(node.body[0], ast.Expr) and
isinstance(node.body[0].value, ast.Constant)):
description = str(node.body[0].value.value)

# Find assertions
assertions = re.findall(r'(assert\s+\w+.*|self\.assert\w+\(.*?\))', code)

examples.append(TestExample(
name=node.name,
file_path=str(file_path),
code=code,
description=description or f"Test function {node.name}",
test_type="unit",
assertions=assertions[:5]
))

return examples

def _extract_js_tests(self, file_path: Path, content: str) -> list[TestExample]:
"""Extract JavaScript/TypeScript test examples."""
examples = []

# Find test/it blocks
test_pattern = r'(it|test)\s*\(\s*[\'"]([^\'"]+)[\'"]\s*,\s*(async\s*)?\(\)\s*=>\s*\{([^}]+(?:\{[^}]*\}[^}]*)*)\}'

for match in re.finditer(test_pattern, content, re.MULTILINE | re.DOTALL):
test_type = match.group(1)
name = match.group(2)
code = match.group(0)

# Find assertions
assertions = re.findall(r'expect\([^)]+\)\.[^;]+', code)

examples.append(TestExample(
name=name,
file_path=str(file_path),
code=code[:500], # Truncate
description=f"Test: {name}",
test_type="unit",
assertions=assertions[:5]
))

return examples

def _extract_go_tests(self, file_path: Path, content: str) -> list[TestExample]:
"""Extract Go test examples."""
examples = []

# Find Test functions
test_pattern = r'func\s+(Test\w+)\s*\(t\s+\*testing\.T\)\s*\{([^}]+(?:\{[^}]*\}[^}]*)*)\}'

for match in re.finditer(test_pattern, content, re.MULTILINE | re.DOTALL):
name = match.group(1)
code = match.group(0)

# Find assertions
assertions = re.findall(r't\.(Error|Fatal|Log)\([^)]+\)', code)

examples.append(TestExample(
name=name,
file_path=str(file_path),
code=code[:500],
description=f"Go test function {name}",
test_type="unit",
assertions=assertions[:5]
))

return examples

def _generate_guides(self, files: list[Path]) -> list[HowToGuide]:
"""C3.3: Generate how-to guides from code patterns."""
guides = []

# Guide topics to look for
topics = {
"authentication": ["auth", "login", "logout", "jwt", "oauth", "session"],
"database": ["database", "db", "model", "migration", "query", "orm"],
"api": ["api", "endpoint", "route", "handler", "controller"],
"deployment": ["deploy", "docker", "kubernetes", "ci", "cd"],
"testing": ["test", "spec", "mock", "fixture"],
"configuration": ["config", "settings", "env", "options"]
}

# Find files matching topics
topic_files: dict[str, list[Path]] = {topic: [] for topic in topics}

for file_path in files:
name_lower = file_path.name.lower()
path_lower = str(file_path).lower()

for topic, keywords in topics.items():
if any(kw in name_lower or kw in path_lower for kw in keywords):
topic_files[topic].append(file_path)

# Generate guides for topics with files
for topic, files_list in topic_files.items():
if not files_list:
continue

# Extract information from files
code_examples = []
steps = []
prereqs = []

for file_path in files_list[:5]:
try:
content = file_path.read_text()

# Extract top-level docstring/comment
if file_path.suffix == ".py":
match = re.search(r'^"""([^"]+)"""', content, re.MULTILINE)
if match:
steps.append(match.group(1).strip()[:200])

# Find code examples (functions/classes)
code_blocks = self._extract_code_examples(content, file_path.suffix)
code_examples.extend(code_blocks[:2])

# Look for imports as prerequisites
imports = re.findall(r'^(?:import|from)\s+(\w+)', content, re.MULTILINE)
prereqs.extend(imports[:5])

except Exception:
continue

if code_examples or steps:
guides.append(HowToGuide(
title=f"How to implement {topic.replace('_', ' ').title()}",
description=f"Guide for implementing {topic} in this codebase",
steps=steps[:5] if steps else [f"See {topic} implementation in codebase"],
code_examples=code_examples[:3],
prerequisites=list(set(prereqs))[:5],
related_files=[str(f) for f in files_list[:5]]
))

return guides

def _extract_code_examples(self, content: str, extension: str) -> list[str]:
"""Extract code examples from content."""
examples = []

if extension == ".py":
# Extract Python functions with docstrings
pattern = r'(def\s+\w+\([^)]*\):[^\n]*\n\s+"""[^"]+"""[^)]+(?:\n\s+[^\n]+){0,10})'
for match in re.finditer(pattern, content):
examples.append(match.group(1)[:500])

elif extension in [".js", ".ts", ".tsx"]:
# Extract JS functions with comments
pattern = r'(/\*\*[^*]+\*/\s*)?(export\s+)?(async\s+)?function\s+\w+\([^)]*\)\s*\{[^}]{0,500}\}'
for match in re.finditer(pattern, content):
examples.append(match.group(0)[:500])

return examples[:5]

def _detect_configs(self, files: list[Path]) -> list[ConfigPattern]:
"""C3.4: Detect configuration patterns."""
configs = []

for file_path in files:
# Check config patterns
for config_type, patterns in self.CONFIG_PATTERNS.items():
if any(re.search(p, str(file_path)) for p in patterns):
try:
config = self._parse_config(file_path, config_type)
if config:
configs.append(config)
except Exception:
pass

if len(configs) >= 20:
break

return configs

def _parse_config(self, file_path: Path, config_type: str) -> Optional[ConfigPattern]:
"""Parse configuration file."""
content = file_path.read_text()
keys = []
example_values = {}

if config_type == "env":
# Parse .env format
for line in content.split('\n'):
if '=' in line and not line.startswith('#'):
key = line.split('=')[0].strip()
keys.append(key)
example_values[key] = "***" # Mask values

elif config_type == "json":
try:
data = json.loads(content)
if isinstance(data, dict):
keys = list(data.keys())[:20]
for k in keys[:5]:
v = data[k]
if isinstance(v, (str, int, bool)):
example_values[k] = v
except json.JSONDecodeError:
return None

elif config_type in ["yaml", "toml"]:
# Simple key extraction
keys = re.findall(r'^(\w+):', content, re.MULTILINE)[:20]

if not keys:
return None

return ConfigPattern(
name=file_path.name,
file_path=str(file_path),
config_type=config_type,
keys=keys,
example_values=example_values,
description=f"Configuration file: {file_path.name}"
)

def _analyze_architecture(self, files: list[Path]) -> ArchitectureOverview:
"""C3.5 + C3.7: Analyze codebase architecture."""
# Get all directories
dirs = set()
for f in files:
for parent in f.parents:
if parent != self.root_path:
dirs.add(parent.name.lower())

# Score architecture patterns
scores: dict[ArchitecturePattern, float] = {}

for pattern, indicators in self.ARCHITECTURE_INDICATORS.items():
score = 0.0

# Directory matches
dir_matches = sum(1 for d in indicators["dirs"] if d in dirs)
score += dir_matches * 0.3

# File matches
for f in files:
if any(f.name.lower() == indicator for indicator in indicators["files"]):
score += 0.2

# Content pattern matches (sample files)
for f in files[:100]:
try:
content = f.read_text()
for pat in indicators["patterns"]:
if re.search(pat, content):
score += 0.1
break
except Exception:
pass

scores[pattern] = min(score, 1.0)

# Determine primary and secondary patterns
sorted_patterns = sorted(scores.items(), key=lambda x: x[1], reverse=True)
primary = sorted_patterns[0][0] if sorted_patterns[0][1] > 0.2 else ArchitecturePattern.MONOLITH
secondary = [p for p, s in sorted_patterns[1:4] if s > 0.1]

# Detect layers
layers = []
layer_names = ["api", "service", "domain", "data", "infrastructure", "presentation", "core"]
for layer in layer_names:
if layer in dirs:
layers.append(layer)

# Find entry points
entry_points = []
entry_patterns = ["main.py", "app.py", "index.js", "index.ts", "main.go", "main.rs"]
for f in files:
if f.name.lower() in entry_patterns:
entry_points.append(str(f))

# Simple dependency detection
dependencies = self._analyze_dependencies(files[:50])

return ArchitectureOverview(
primary_pattern=primary,
secondary_patterns=secondary,
layers=layers,
entry_points=entry_points[:5],
data_flow=[], # Complex analysis, skip for now
dependencies=dependencies,
confidence=scores.get(primary, 0.0)
)

def _analyze_dependencies(self, files: list[Path]) -> dict[str, list[str]]:
"""Analyze file dependencies."""
deps: dict[str, list[str]] = {}

for file_path in files:
try:
content = file_path.read_text()
imports = []

# Python imports
if file_path.suffix == ".py":
imports = re.findall(r'^(?:from\s+(\S+)|import\s+(\S+))', content, re.MULTILINE)
imports = [i[0] or i[1] for i in imports]

# JS/TS imports
elif file_path.suffix in [".js", ".ts", ".tsx"]:
imports = re.findall(r'(?:import|require)\s*\(?[\'"]([^\'"]+)[\'"]', content)

# Go imports
elif file_path.suffix == ".go":
imports = re.findall(r'import\s+["\']([^"\']+)["\']', content)

if imports:
deps[str(file_path.relative_to(self.root_path))] = imports[:10]

except Exception:
pass

return deps

def _file_language(self, file_path: Path) -> Optional[str]:
"""Determine file language."""
ext = file_path.suffix.lower()
for lang, exts in self.LANGUAGE_EXTENSIONS.items():
if ext in exts:
return lang
return None