# scripts/codebase_analyzer.py

#!/usr/bin/env python3
"""CODITECT Codebase Analyzer.

C3.x suite for comprehensive codebase analysis:

  • C3.1: Design pattern detection (via PatternDetector)
  • C3.2: Test example extraction
  • C3.3: How-to guide generation
  • C3.4: Configuration pattern extraction
  • C3.5: Architecture overview generation
  • C3.7: Architectural pattern detection

Author: CODITECT
Version: 1.0.0
"""

import ast
import json
import re
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Optional

from .pattern_detector import PatternDetector, PatternReport

class ArchitecturePattern(Enum):
    """High-level architectural patterns a codebase may exhibit."""

    MVC = "mvc"
    MVVM = "mvvm"
    MICROSERVICES = "microservices"
    MONOLITH = "monolith"
    EVENT_DRIVEN = "event_driven"
    LAYERED = "layered"
    HEXAGONAL = "hexagonal"
    PLUGIN = "plugin"
    COMPONENT_BASED = "component_based"

@dataclass
class TestExample:
    """A single test case extracted from a test file."""

    name: str
    file_path: str
    code: str
    description: str
    # One of: "unit", "integration", "e2e".
    test_type: str
    setup_code: Optional[str] = None
    assertions: list[str] = field(default_factory=list)

@dataclass
class HowToGuide:
    """A generated how-to guide for a codebase topic."""

    title: str
    description: str
    steps: list[str]
    code_examples: list[str]
    prerequisites: list[str] = field(default_factory=list)
    related_files: list[str] = field(default_factory=list)

@dataclass
class ConfigPattern:
    """A detected configuration file and the keys extracted from it."""

    name: str
    file_path: str
    # One of: "env", "json", "yaml", "toml", "ini".
    config_type: str
    keys: list[str]
    example_values: dict[str, Any]
    description: str = ""

@dataclass
class ArchitectureOverview:
    """Summary of the codebase's inferred architecture."""

    primary_pattern: ArchitecturePattern
    secondary_patterns: list[ArchitecturePattern]
    layers: list[str]
    entry_points: list[str]
    # (source, target, description) edges describing data flow.
    data_flow: list[tuple[str, str, str]]
    dependencies: dict[str, list[str]]
    confidence: float = 0.0

@dataclass
class CodebaseReport:
    """Complete codebase analysis report produced by CodebaseAnalyzer."""

    root_path: str
    # Mapping of language name -> number of files in that language.
    languages: dict[str, int]
    total_files: int
    total_lines: int
    pattern_reports: list[PatternReport]
    test_examples: list[TestExample]
    how_to_guides: list[HowToGuide]
    config_patterns: list[ConfigPattern]
    architecture: Optional[ArchitectureOverview] = None

    def to_dict(self) -> dict:
        """Return a JSON-serializable summary (counts, not full contents)."""
        return {
            "root_path": self.root_path,
            "languages": self.languages,
            "total_files": self.total_files,
            "total_lines": self.total_lines,
            "patterns_found": sum(len(r.patterns) for r in self.pattern_reports),
            "test_examples": len(self.test_examples),
            "how_to_guides": len(self.how_to_guides),
            "config_patterns": len(self.config_patterns),
            "architecture": self.architecture.primary_pattern.value if self.architecture else None
        }

class CodebaseAnalyzer:
    """CODITECT Codebase Analyzer.

    Implements C3.x analysis suite:
    - C3.1: Pattern detection (delegates to PatternDetector)
    - C3.2: Test extraction with example parsing
    - C3.3: How-to guide generation from docstrings/comments
    - C3.4: Configuration pattern detection
    - C3.5: Architecture overview with data flow
    - C3.7: Architectural pattern detection

    Improvements over basic analyzers:
    - Multi-language support (9 languages)
    - Context-aware extraction (understands test frameworks)
    - Intelligent guide generation from code patterns
    - Dependency graph analysis
    """

    # File extensions recognized for each supported language.
    LANGUAGE_EXTENSIONS = {
        "python": [".py"],
        "javascript": [".js", ".mjs"],
        "typescript": [".ts", ".tsx"],
        "java": [".java"],
        "go": [".go"],
        "rust": [".rs"],
        "cpp": [".cpp", ".cc", ".cxx", ".hpp", ".h"],
        "csharp": [".cs"],
        "ruby": [".rb"]
    }

    # Per-language heuristics for recognizing test files, test functions,
    # and assertion statements.
    TEST_PATTERNS = {
        "python": {
            "frameworks": ["pytest", "unittest", "nose"],
            "file_patterns": [r"test_.*\.py$", r".*_test\.py$", r"tests\.py$"],
            "function_patterns": [r"^test_", r"^Test"],
            "assertion_patterns": [r"assert\s+", r"self\.assert", r"pytest\.raises"]
        },
        "javascript": {
            "frameworks": ["jest", "mocha", "jasmine", "vitest"],
            "file_patterns": [r".*\.test\.(js|ts)$", r".*\.spec\.(js|ts)$"],
            "function_patterns": [r"^it\(", r"^test\(", r"^describe\("],
            "assertion_patterns": [r"expect\(", r"assert\.", r"should\."]
        },
        "go": {
            "frameworks": ["testing"],
            "file_patterns": [r".*_test\.go$"],
            "function_patterns": [r"^func\s+Test"],
            "assertion_patterns": [r"t\.Error", r"t\.Fatal", r"assert\."]
        },
        "rust": {
            "frameworks": ["cargo test"],
            "file_patterns": [r".*_test\.rs$", r"tests/.*\.rs$"],
            "function_patterns": [r"#\[test\]", r"#\[tokio::test\]"],
            "assertion_patterns": [r"assert!", r"assert_eq!", r"assert_ne!"]
        }
    }

    # Filename regexes that identify configuration files, keyed by format.
    CONFIG_PATTERNS = {
        "env": [r"\.env.*$", r"\.env$"],
        "json": [r".*config.*\.json$", r"package\.json$", r"tsconfig\.json$"],
        "yaml": [r".*\.ya?ml$", r".*config.*\.ya?ml$"],
        "toml": [r".*\.toml$", r"Cargo\.toml$", r"pyproject\.toml$"],
        "ini": [r".*\.ini$", r"setup\.cfg$", r"tox\.ini$"]
    }

    # Directory names, file names, and content regexes that hint at each
    # architectural pattern during scoring.
    ARCHITECTURE_INDICATORS = {
        ArchitecturePattern.MVC: {
            "dirs": ["models", "views", "controllers"],
            "files": ["controller.py", "view.py", "model.py"],
            "patterns": [r"class\s+\w+Controller", r"class\s+\w+View"]
        },
        ArchitecturePattern.MICROSERVICES: {
            "dirs": ["services", "api", "gateway"],
            "files": ["docker-compose.yml", "kubernetes.yaml"],
            "patterns": [r"@Service", r"@RestController", r"@app\.route"]
        },
        ArchitecturePattern.EVENT_DRIVEN: {
            "dirs": ["events", "handlers", "listeners", "subscribers"],
            "files": [],
            "patterns": [r"@EventHandler", r"on_event", r"emit\(", r"subscribe\("]
        },
        ArchitecturePattern.LAYERED: {
            "dirs": ["domain", "application", "infrastructure", "presentation"],
            "files": [],
            "patterns": [r"class\s+\w+Repository", r"class\s+\w+Service"]
        },
        ArchitecturePattern.HEXAGONAL: {
            "dirs": ["ports", "adapters", "core", "domain"],
            "files": [],
            "patterns": [r"interface\s+\w+Port", r"class\s+\w+Adapter"]
        },
        ArchitecturePattern.COMPONENT_BASED: {
            "dirs": ["components", "modules", "features"],
            "files": [],
            "patterns": [r"@Component", r"export\s+default\s+function\s+\w+"]
        }
    }

def __init__(self, root_path: Path):
self.root_path = Path(root_path)
self.pattern_detector = PatternDetector()

def analyze(
self,
skip_tests: bool = False,
skip_guides: bool = False,
skip_configs: bool = False,
skip_architecture: bool = False,
max_files: int = 1000
) -> CodebaseReport:
"""
Perform comprehensive codebase analysis.

Args:
skip_tests: Skip C3.2 test extraction
skip_guides: Skip C3.3 guide generation
skip_configs: Skip C3.4 config detection
skip_architecture: Skip C3.5/C3.7 architecture analysis
max_files: Maximum files to analyze

Returns:
CodebaseReport with all analysis results
"""
# Collect files
all_files = self._collect_files(max_files)
languages = self._detect_languages(all_files)
total_lines = self._count_lines(all_files)

# C3.1: Pattern detection
pattern_reports = self.pattern_detector.detect_in_directory(self.root_path)

# C3.2: Test extraction
test_examples = []
if not skip_tests:
test_examples = self._extract_tests(all_files)

# C3.3: How-to guides
how_to_guides = []
if not skip_guides:
how_to_guides = self._generate_guides(all_files)

# C3.4: Config patterns
config_patterns = []
if not skip_configs:
config_patterns = self._detect_configs(all_files)

# C3.5 + C3.7: Architecture
architecture = None
if not skip_architecture:
architecture = self._analyze_architecture(all_files)

return CodebaseReport(
root_path=str(self.root_path),
languages=languages,
total_files=len(all_files),
total_lines=total_lines,
pattern_reports=pattern_reports,
test_examples=test_examples,
how_to_guides=how_to_guides,
config_patterns=config_patterns,
architecture=architecture
)

def _collect_files(self, max_files: int) -> list[Path]:
"""Collect all source files."""
files = []
extensions = set()
for exts in self.LANGUAGE_EXTENSIONS.values():
extensions.update(exts)

for ext in extensions:
for file_path in self.root_path.rglob(f"*{ext}"):
# Skip hidden, vendor, node_modules
str_path = str(file_path)
if any(skip in str_path for skip in [
"/.", "node_modules", "vendor", "__pycache__",
"venv", ".venv", "dist", "build"
]):
continue
files.append(file_path)
if len(files) >= max_files:
return files

return files

def _detect_languages(self, files: list[Path]) -> dict[str, int]:
"""Detect languages by file extension."""
counts = {}
for file_path in files:
ext = file_path.suffix.lower()
for lang, exts in self.LANGUAGE_EXTENSIONS.items():
if ext in exts:
counts[lang] = counts.get(lang, 0) + 1
break
return counts

def _count_lines(self, files: list[Path]) -> int:
"""Count total lines of code."""
total = 0
for file_path in files:
try:
total += len(file_path.read_text().split('\n'))
except Exception:
pass
return total

def _extract_tests(self, files: list[Path]) -> list[TestExample]:
"""C3.2: Extract test examples from test files."""
examples = []

for file_path in files:
# Determine language
lang = self._file_language(file_path)
if not lang or lang not in self.TEST_PATTERNS:
continue

test_config = self.TEST_PATTERNS[lang]

# Check if this is a test file
is_test = any(
re.search(pattern, str(file_path))
for pattern in test_config["file_patterns"]
)
if not is_test:
continue

try:
content = file_path.read_text()
except Exception:
continue

# Extract tests based on language
if lang == "python":
examples.extend(self._extract_python_tests(file_path, content))
elif lang in ["javascript", "typescript"]:
examples.extend(self._extract_js_tests(file_path, content))
elif lang == "go":
examples.extend(self._extract_go_tests(file_path, content))

# Limit examples
if len(examples) >= 50:
break

return examples

def _extract_python_tests(self, file_path: Path, content: str) -> list[TestExample]:
"""Extract Python test examples."""
examples = []

try:
tree = ast.parse(content)
except SyntaxError:
return examples

for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and node.name.startswith("test_"):
# Get function source
lines = content.split('\n')
start = node.lineno - 1
end = node.end_lineno if hasattr(node, 'end_lineno') else start + 20

code = '\n'.join(lines[start:end])

# Extract docstring
description = ""
if (node.body and isinstance(node.body[0], ast.Expr) and
isinstance(node.body[0].value, ast.Constant)):
description = str(node.body[0].value.value)

# Find assertions
assertions = re.findall(r'(assert\s+\w+.*|self\.assert\w+\(.*?\))', code)

examples.append(TestExample(
name=node.name,
file_path=str(file_path),
code=code,
description=description or f"Test function {node.name}",
test_type="unit",
assertions=assertions[:5]
))

return examples

def _extract_js_tests(self, file_path: Path, content: str) -> list[TestExample]:
"""Extract JavaScript/TypeScript test examples."""
examples = []

# Find test/it blocks
test_pattern = r'(it|test)\s*\(\s*[\'"]([^\'"]+)[\'"]\s*,\s*(async\s*)?\(\)\s*=>\s*\{([^}]+(?:\{[^}]*\}[^}]*)*)\}'

for match in re.finditer(test_pattern, content, re.MULTILINE | re.DOTALL):
test_type = match.group(1)
name = match.group(2)
code = match.group(0)

# Find assertions
assertions = re.findall(r'expect\([^)]+\)\.[^;]+', code)

examples.append(TestExample(
name=name,
file_path=str(file_path),
code=code[:500], # Truncate
description=f"Test: {name}",
test_type="unit",
assertions=assertions[:5]
))

return examples

def _extract_go_tests(self, file_path: Path, content: str) -> list[TestExample]:
"""Extract Go test examples."""
examples = []

# Find Test functions
test_pattern = r'func\s+(Test\w+)\s*\(t\s+\*testing\.T\)\s*\{([^}]+(?:\{[^}]*\}[^}]*)*)\}'

for match in re.finditer(test_pattern, content, re.MULTILINE | re.DOTALL):
name = match.group(1)
code = match.group(0)

# Find assertions
assertions = re.findall(r't\.(Error|Fatal|Log)\([^)]+\)', code)

examples.append(TestExample(
name=name,
file_path=str(file_path),
code=code[:500],
description=f"Go test function {name}",
test_type="unit",
assertions=assertions[:5]
))

return examples

def _generate_guides(self, files: list[Path]) -> list[HowToGuide]:
"""C3.3: Generate how-to guides from code patterns."""
guides = []

# Guide topics to look for
topics = {
"authentication": ["auth", "login", "logout", "jwt", "oauth", "session"],
"database": ["database", "db", "model", "migration", "query", "orm"],
"api": ["api", "endpoint", "route", "handler", "controller"],
"deployment": ["deploy", "docker", "kubernetes", "ci", "cd"],
"testing": ["test", "spec", "mock", "fixture"],
"configuration": ["config", "settings", "env", "options"]
}

# Find files matching topics
topic_files: dict[str, list[Path]] = {topic: [] for topic in topics}

for file_path in files:
name_lower = file_path.name.lower()
path_lower = str(file_path).lower()

for topic, keywords in topics.items():
if any(kw in name_lower or kw in path_lower for kw in keywords):
topic_files[topic].append(file_path)

# Generate guides for topics with files
for topic, files_list in topic_files.items():
if not files_list:
continue

# Extract information from files
code_examples = []
steps = []
prereqs = []

for file_path in files_list[:5]:
try:
content = file_path.read_text()

# Extract top-level docstring/comment
if file_path.suffix == ".py":
match = re.search(r'^"""([^"]+)"""', content, re.MULTILINE)
if match:
steps.append(match.group(1).strip()[:200])

# Find code examples (functions/classes)
code_blocks = self._extract_code_examples(content, file_path.suffix)
code_examples.extend(code_blocks[:2])

# Look for imports as prerequisites
imports = re.findall(r'^(?:import|from)\s+(\w+)', content, re.MULTILINE)
prereqs.extend(imports[:5])

except Exception:
continue

if code_examples or steps:
guides.append(HowToGuide(
title=f"How to implement {topic.replace('_', ' ').title()}",
description=f"Guide for implementing {topic} in this codebase",
steps=steps[:5] if steps else [f"See {topic} implementation in codebase"],
code_examples=code_examples[:3],
prerequisites=list(set(prereqs))[:5],
related_files=[str(f) for f in files_list[:5]]
))

return guides

def _extract_code_examples(self, content: str, extension: str) -> list[str]:
"""Extract code examples from content."""
examples = []

if extension == ".py":
# Extract Python functions with docstrings
pattern = r'(def\s+\w+\([^)]*\):[^\n]*\n\s+"""[^"]+"""[^)]+(?:\n\s+[^\n]+){0,10})'
for match in re.finditer(pattern, content):
examples.append(match.group(1)[:500])

elif extension in [".js", ".ts", ".tsx"]:
# Extract JS functions with comments
pattern = r'(/\*\*[^*]+\*/\s*)?(export\s+)?(async\s+)?function\s+\w+\([^)]*\)\s*\{[^}]{0,500}\}'
for match in re.finditer(pattern, content):
examples.append(match.group(0)[:500])

return examples[:5]

def _detect_configs(self, files: list[Path]) -> list[ConfigPattern]:
"""C3.4: Detect configuration patterns."""
configs = []

for file_path in files:
# Check config patterns
for config_type, patterns in self.CONFIG_PATTERNS.items():
if any(re.search(p, str(file_path)) for p in patterns):
try:
config = self._parse_config(file_path, config_type)
if config:
configs.append(config)
except Exception:
pass

if len(configs) >= 20:
break

return configs

def _parse_config(self, file_path: Path, config_type: str) -> Optional[ConfigPattern]:
"""Parse configuration file."""
content = file_path.read_text()
keys = []
example_values = {}

if config_type == "env":
# Parse .env format
for line in content.split('\n'):
if '=' in line and not line.startswith('#'):
key = line.split('=')[0].strip()
keys.append(key)
example_values[key] = "***" # Mask values

elif config_type == "json":
try:
data = json.loads(content)
if isinstance(data, dict):
keys = list(data.keys())[:20]
for k in keys[:5]:
v = data[k]
if isinstance(v, (str, int, bool)):
example_values[k] = v
except json.JSONDecodeError:
return None

elif config_type in ["yaml", "toml"]:
# Simple key extraction
keys = re.findall(r'^(\w+):', content, re.MULTILINE)[:20]

if not keys:
return None

return ConfigPattern(
name=file_path.name,
file_path=str(file_path),
config_type=config_type,
keys=keys,
example_values=example_values,
description=f"Configuration file: {file_path.name}"
)

def _analyze_architecture(self, files: list[Path]) -> ArchitectureOverview:
"""C3.5 + C3.7: Analyze codebase architecture."""
# Get all directories
dirs = set()
for f in files:
for parent in f.parents:
if parent != self.root_path:
dirs.add(parent.name.lower())

# Score architecture patterns
scores: dict[ArchitecturePattern, float] = {}

for pattern, indicators in self.ARCHITECTURE_INDICATORS.items():
score = 0.0

# Directory matches
dir_matches = sum(1 for d in indicators["dirs"] if d in dirs)
score += dir_matches * 0.3

# File matches
for f in files:
if any(f.name.lower() == indicator for indicator in indicators["files"]):
score += 0.2

# Content pattern matches (sample files)
for f in files[:100]:
try:
content = f.read_text()
for pat in indicators["patterns"]:
if re.search(pat, content):
score += 0.1
break
except Exception:
pass

scores[pattern] = min(score, 1.0)

# Determine primary and secondary patterns
sorted_patterns = sorted(scores.items(), key=lambda x: x[1], reverse=True)
primary = sorted_patterns[0][0] if sorted_patterns[0][1] > 0.2 else ArchitecturePattern.MONOLITH
secondary = [p for p, s in sorted_patterns[1:4] if s > 0.1]

# Detect layers
layers = []
layer_names = ["api", "service", "domain", "data", "infrastructure", "presentation", "core"]
for layer in layer_names:
if layer in dirs:
layers.append(layer)

# Find entry points
entry_points = []
entry_patterns = ["main.py", "app.py", "index.js", "index.ts", "main.go", "main.rs"]
for f in files:
if f.name.lower() in entry_patterns:
entry_points.append(str(f))

# Simple dependency detection
dependencies = self._analyze_dependencies(files[:50])

return ArchitectureOverview(
primary_pattern=primary,
secondary_patterns=secondary,
layers=layers,
entry_points=entry_points[:5],
data_flow=[], # Complex analysis, skip for now
dependencies=dependencies,
confidence=scores.get(primary, 0.0)
)

def _analyze_dependencies(self, files: list[Path]) -> dict[str, list[str]]:
"""Analyze file dependencies."""
deps: dict[str, list[str]] = {}

for file_path in files:
try:
content = file_path.read_text()
imports = []

# Python imports
if file_path.suffix == ".py":
imports = re.findall(r'^(?:from\s+(\S+)|import\s+(\S+))', content, re.MULTILINE)
imports = [i[0] or i[1] for i in imports]

# JS/TS imports
elif file_path.suffix in [".js", ".ts", ".tsx"]:
imports = re.findall(r'(?:import|require)\s*\(?[\'"]([^\'"]+)[\'"]', content)

# Go imports
elif file_path.suffix == ".go":
imports = re.findall(r'import\s+["\']([^"\']+)["\']', content)

if imports:
deps[str(file_path.relative_to(self.root_path))] = imports[:10]

except Exception:
pass

return deps

def _file_language(self, file_path: Path) -> Optional[str]:
"""Determine file language."""
ext = file_path.suffix.lower()
for lang, exts in self.LANGUAGE_EXTENSIONS.items():
if ext in exts:
return lang
return None