Pattern Finding Skill
When to Use This Skill
Use this skill when you need to find patterns in a codebase: similar or duplicated code, code clones, or structural anti-patterns.
How to Use This Skill
- Review the patterns and examples below
- Apply the relevant patterns to your implementation
- Follow the best practices outlined in this skill
Pattern matching, similar code discovery, anti-pattern detection, and duplication analysis for code quality improvement.
Core Capabilities
- Similar Code Detection - AST-based similarity, token matching, structural comparison
- Duplication Analysis - Exact duplicates, near-duplicates, code clones
- Anti-Pattern Detection - God objects, spaghetti code, tight coupling
- Refactoring Opportunities - Extract method, extract class, consolidate duplicates
- Pattern Extraction - Common idioms, reusable patterns, template generation
Clone Detection Engine
# scripts/clone_detector.py
import ast
import copy
import hashlib
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple
@dataclass
class CodeClone:
    """A group of code fragments detected as clones of one another.

    ``type`` follows the standard clone taxonomy; note the field name
    shadows the ``type`` builtin, so avoid ``type`` as a bare name inside
    methods that work with instances of this class.
    """
    type: int  # clone taxonomy: 1=exact, 2=renamed, 3=gapped, 4=semantic
    locations: List[Tuple[str, int, int]]  # one (file, start_line, end_line) per occurrence, 1-based inclusive
    similarity: float  # 0.0-1.0; 1.0 for exact (Type 1) clones
    code_snippet: str  # representative source text of one occurrence
    size_lines: int  # clone length in lines
class CloneDetector:
    """Detect code clones in a Python codebase using text, AST and token analysis.

    Three clone types are detected (standard clone taxonomy):
      * Type 1 - exact duplicates (ignoring whitespace and comments),
      * Type 2 - structurally identical functions with renamed identifiers,
      * Type 3 - near-duplicates ("gapped" clones) found via token similarity.
    """

    def __init__(self, min_clone_lines: int = 6, similarity_threshold: float = 0.85):
        """
        Args:
            min_clone_lines: Smallest window (in lines) considered a clone.
            similarity_threshold: Jaccard similarity cutoff for Type 3 clones.
        """
        self.min_clone_lines = min_clone_lines
        self.similarity_threshold = similarity_threshold
        self.clones: List["CodeClone"] = []

    def detect(self, root_path: str) -> List["CodeClone"]:
        """Run all three detectors over every ``*.py`` file under *root_path*."""
        # Type 1: exact clones (same code, different whitespace/comments).
        type1 = self._detect_exact_clones(root_path)
        # Type 2: renamed clones (same structure, different identifiers).
        type2 = self._detect_renamed_clones(root_path)
        # Type 3: gapped clones (copied code with additions/deletions).
        type3 = self._detect_gapped_clones(root_path)
        self.clones = type1 + type2 + type3
        return self.clones

    def _detect_exact_clones(self, root_path: str) -> List["CodeClone"]:
        """Detect Type 1 clones by hashing normalized sliding windows of lines."""
        clones: List["CodeClone"] = []
        # hash of normalized window -> [(file, start_line, end_line, raw_text)]
        hash_map: Dict[str, List[Tuple[str, int, int, str]]] = defaultdict(list)
        for py_file in Path(root_path).rglob('*.py'):
            try:
                with open(py_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
            except OSError:
                # Unreadable file (permissions, broken symlink, ...): skip it.
                continue
            # Sliding window over the file's lines.
            for i in range(len(lines) - self.min_clone_lines + 1):
                window = lines[i:i + self.min_clone_lines]
                # Normalize away whitespace and comments, then hash.
                normalized = self._normalize_code(window)
                code_hash = hashlib.md5(normalized.encode()).hexdigest()
                hash_map[code_hash].append((
                    str(py_file),
                    i + 1,                     # 1-based start line
                    i + self.min_clone_lines,  # 1-based inclusive end line
                    ''.join(window)
                ))
        # Any hash seen at more than one location is an exact clone group.
        for code_hash, locations in hash_map.items():
            if len(locations) > 1:
                clones.append(CodeClone(
                    type=1,
                    locations=[(loc[0], loc[1], loc[2]) for loc in locations],
                    similarity=1.0,
                    code_snippet=locations[0][3],
                    size_lines=self.min_clone_lines
                ))
        return clones

    def _detect_renamed_clones(self, root_path: str) -> List["CodeClone"]:
        """Detect Type 2 clones: functions identical up to identifier renaming."""
        clones: List["CodeClone"] = []
        ast_map: Dict[str, List[Tuple[str, ast.FunctionDef, str]]] = defaultdict(list)
        for py_file in Path(root_path).rglob('*.py'):
            try:
                with open(py_file, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                tree = ast.parse(content)
            except (OSError, SyntaxError, ValueError):
                # Unreadable or unparseable (e.g. Python 2, null bytes): skip.
                continue
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef):
                    # Hash a name-normalized *copy* so functions differing only
                    # in identifier names (including the function's own name
                    # and its argument names) collide on the same key.
                    ast_hash = self._hash_ast(self._normalize_ast(node))
                    original_code = ast.get_source_segment(content, node)
                    ast_map[ast_hash].append((
                        str(py_file),
                        node,
                        original_code or ''
                    ))
        for ast_hash, nodes in ast_map.items():
            if len(nodes) > 1:
                first = nodes[0][1]
                # end_lineno can be None on synthesized/old AST nodes.
                end = first.end_lineno or first.lineno
                clones.append(CodeClone(
                    type=2,
                    locations=[
                        (n[0], n[1].lineno, n[1].end_lineno or n[1].lineno)
                        for n in nodes
                    ],
                    similarity=0.95,  # heuristic: renamed clones are "almost" identical
                    code_snippet=nodes[0][2],
                    size_lines=end - first.lineno + 1
                ))
        return clones

    def _detect_gapped_clones(self, root_path: str) -> List["CodeClone"]:
        """Detect Type 3 clones (copies with small modifications).

        Windows are tokenized, shingled into token 5-grams and bucketed by a
        MinHash-style signature; windows sharing a bucket are then compared
        pairwise with Jaccard similarity against ``self.similarity_threshold``.
        """
        clones: List["CodeClone"] = []
        # signature -> [(file, start_line, end_line, tokens)]
        token_sequences: Dict[str, List[Tuple[str, int, int, List[str]]]] = defaultdict(list)
        for py_file in Path(root_path).rglob('*.py'):
            try:
                with open(py_file, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
            except OSError:
                continue
            for i in range(len(lines) - self.min_clone_lines + 1):
                window = lines[i:i + self.min_clone_lines]
                tokens = self._tokenize_code(''.join(window))
                # Shingles (token n-grams) preserve local token order in the signature.
                shingles = self._create_shingles(tokens, n=5)
                signature = self._minhash(shingles)
                token_sequences[signature].append((
                    str(py_file),
                    i + 1,
                    i + self.min_clone_lines,
                    tokens
                ))
        # Candidate pairs share a signature bucket; verify with real similarity.
        for signature, sequences in token_sequences.items():
            if len(sequences) > 1:
                for i in range(len(sequences)):
                    for j in range(i + 1, len(sequences)):
                        similarity = self._calculate_token_similarity(
                            sequences[i][3],
                            sequences[j][3]
                        )
                        if similarity >= self.similarity_threshold:
                            clones.append(CodeClone(
                                type=3,
                                locations=[
                                    (sequences[i][0], sequences[i][1], sequences[i][2]),
                                    (sequences[j][0], sequences[j][1], sequences[j][2])
                                ],
                                similarity=similarity,
                                code_snippet='',
                                size_lines=self.min_clone_lines
                            ))
        return clones

    def _normalize_code(self, lines: List[str]) -> str:
        """Normalize a window of lines: strip whitespace, drop comment text.

        NOTE: the inline-comment strip is naive - a ``#`` inside a string
        literal is treated as a comment start. Acceptable for hashing since
        it is applied consistently to every window.
        """
        normalized = []
        for line in lines:
            line = line.strip()
            if line.startswith('#'):
                continue  # whole-line comment
            if '#' in line:
                line = line[:line.index('#')].strip()  # drop inline comment
            if line:
                normalized.append(line)
        return '\n'.join(normalized)

    def _normalize_ast(self, node: ast.AST) -> ast.AST:
        """Return a name-normalized deep copy of *node*.

        All identifiers - variable names, argument names and the function
        name itself - are replaced with sequential ``VARn`` placeholders so
        that two functions differing only in naming hash identically.

        The original implementation mutated the caller's tree in place and
        left function/argument names untouched, so renamed functions never
        matched; this version fixes both.
        """
        class NameNormalizer(ast.NodeTransformer):
            def __init__(self):
                # original identifier -> placeholder (stable within one function)
                self.name_map: Dict[str, str] = {}

            def _placeholder(self, original: str) -> str:
                if original not in self.name_map:
                    self.name_map[original] = f'VAR{len(self.name_map)}'
                return self.name_map[original]

            def visit_Name(self, node):
                node.id = self._placeholder(node.id)
                return self.generic_visit(node)

            def visit_arg(self, node):
                node.arg = self._placeholder(node.arg)
                return self.generic_visit(node)

            def visit_FunctionDef(self, node):
                node.name = self._placeholder(node.name)
                return self.generic_visit(node)

        # Deep-copy first so the caller's tree (used for line numbers and
        # source extraction) is never mutated.
        return NameNormalizer().visit(copy.deepcopy(node))

    def _hash_ast(self, node: ast.AST) -> str:
        """Hash the AST structure (field values only, no line/col attributes)."""
        ast_dump = ast.dump(node, annotate_fields=False)
        return hashlib.md5(ast_dump.encode()).hexdigest()

    def _tokenize_code(self, code: str) -> List[str]:
        """Tokenize *code*, keeping only tokens meaningful for similarity."""
        import io
        import tokenize
        tokens: List[str] = []
        try:
            for token in tokenize.generate_tokens(io.StringIO(code).readline):
                if token.type not in (tokenize.COMMENT, tokenize.NL, tokenize.NEWLINE):
                    tokens.append(token.string)
        except (tokenize.TokenError, SyntaxError):
            # Windows often start or end mid-statement; keep whatever was
            # tokenized before the error instead of failing the whole window.
            pass
        return tokens

    def _create_shingles(self, tokens: List[str], n: int = 5) -> Set[str]:
        """Create the set of token n-grams (shingles) of *tokens*."""
        shingles = set()
        for i in range(len(tokens) - n + 1):
            shingles.add(' '.join(tokens[i:i + n]))
        return shingles

    def _minhash(self, shingles: Set[str]) -> str:
        """Bucket signature for fast candidate grouping.

        Simplified single-permutation MinHash: hash of the lexicographically
        smallest shingle. Use a real MinHash/LSH library (e.g. datasketch)
        in production for better recall.
        """
        if not shingles:
            return ''
        return hashlib.md5(min(shingles).encode()).hexdigest()

    def _calculate_token_similarity(self, tokens1: List[str], tokens2: List[str]) -> float:
        """Jaccard similarity of the two token *sets* (order-insensitive)."""
        set1 = set(tokens1)
        set2 = set(tokens2)
        intersection = len(set1 & set2)
        union = len(set1 | set2)
        return intersection / union if union > 0 else 0.0
# Usage example: scan a project and print the first few clone groups found.
# NOTE: runs at import time; replace '/path/to/project' with a real root.
detector = CloneDetector(min_clone_lines=6, similarity_threshold=0.85)
clones = detector.detect('/path/to/project')
print(f"Found {len(clones)} code clones:")
for clone in clones[:10]:  # Show first 10
    print(f"\nType {clone.type} Clone (similarity: {clone.similarity:.2f}):")
    for file_path, start, end in clone.locations:
        print(f"  {file_path}:{start}-{end}")
Anti-Pattern Detector
// tools/anti-pattern-detector.ts
// Describes one detectable anti-pattern and the heuristic that flags it.
interface AntiPattern {
  name: string;
  description: string;
  severity: 'critical' | 'high' | 'medium' | 'low';
  // Predicate over the parsed file and its computed metrics; true => flagged.
  // NOTE(review): both parameters are untyped (`any`) - the expected
  // parser/AST shape is not shown in this file.
  indicators: (ast: any, metrics: any) => boolean;
  // Optional human-readable remediation hint.
  fix?: string;
}
// One concrete finding: which pattern fired, where, and how to fix it.
interface Detection {
  pattern: string; // AntiPattern.name that matched
  file: string; // path of the offending file
  location: { line: number; column: number };
  severity: string;
  message: string;
  suggestion?: string; // remediation hint, if the pattern defines one
}
// Detects structural anti-patterns by evaluating metric-based heuristics
// against each source file's AST and computed metrics.
class AntiPatternDetector {
  // Catalogue of detectable anti-patterns. The numeric thresholds
  // (20 methods, 500 lines, 5 parameters, ...) are conventional heuristics;
  // tune them per codebase.
  private patterns: AntiPattern[] = [
    {
      name: 'God Object',
      description: 'Class with too many responsibilities',
      severity: 'high',
      indicators: (ast, metrics) =>
        metrics.methods > 20 || metrics.lines > 500 || metrics.dependencies > 15,
      fix: 'Split into multiple focused classes using Extract Class refactoring'
    },
    {
      name: 'Long Method',
      description: 'Method with too many lines of code',
      severity: 'medium',
      indicators: (ast, metrics) => metrics.lines > 50 || metrics.complexity > 15,
      fix: 'Extract smaller methods using Extract Method refactoring'
    },
    {
      name: 'Long Parameter List',
      description: 'Method with too many parameters',
      severity: 'medium',
      indicators: (ast, metrics) => metrics.parameters > 5,
      fix: 'Introduce Parameter Object or Builder pattern'
    },
    {
      name: 'Duplicate Code',
      description: 'Similar or identical code in multiple places',
      severity: 'high',
      indicators: (ast, metrics) => metrics.duplication > 0.1, // 10% duplication
      fix: 'Extract common code into shared function or class'
    },
    {
      name: 'Dead Code',
      description: 'Unused code that can be removed',
      severity: 'low',
      indicators: (ast, metrics) => metrics.unusedFunctions > 0 || metrics.unusedImports > 0,
      fix: 'Remove unused code to improve maintainability'
    },
    {
      name: 'Magic Numbers',
      description: 'Unexplained numeric literals',
      severity: 'low',
      indicators: (ast, metrics) => metrics.magicNumbers > 3,
      fix: 'Replace with named constants'
    },
    {
      name: 'Tight Coupling',
      description: 'Class depends on too many other classes',
      severity: 'high',
      indicators: (ast, metrics) => metrics.coupling > 10,
      fix: 'Apply Dependency Injection and interface segregation'
    },
    {
      name: 'Low Cohesion',
      description: 'Class methods have little in common',
      severity: 'medium',
      indicators: (ast, metrics) => metrics.cohesion < 0.3,
      fix: 'Split class into more cohesive units'
    }
  ];

  // Parse every source file, compute its metrics, and evaluate every
  // pattern's predicate against them.
  // NOTE(review): findSourceFiles, parseFile, getLocation and most metric
  // helpers referenced below are not defined in this snippet - they are
  // assumed to be implemented elsewhere in the class.
  async detect(codebasePath: string): Promise<Detection[]> {
    const detections: Detection[] = [];
    const files = await this.findSourceFiles(codebasePath);
    for (const file of files) {
      const ast = await this.parseFile(file);
      const metrics = await this.calculateMetrics(ast, file);
      for (const pattern of this.patterns) {
        if (pattern.indicators(ast, metrics)) {
          detections.push({
            pattern: pattern.name,
            file,
            location: this.getLocation(ast),
            severity: pattern.severity,
            message: pattern.description,
            suggestion: pattern.fix
          });
        }
      }
    }
    return detections;
  }

  // Assemble the per-file metrics object consumed by the pattern predicates.
  // Keys here must match the names used in `patterns[].indicators`.
  private async calculateMetrics(ast: any, filePath: string): Promise<any> {
    return {
      lines: this.countLines(ast),
      methods: this.countMethods(ast),
      parameters: this.getMaxParameters(ast),
      complexity: this.calculateComplexity(ast),
      dependencies: this.countDependencies(ast),
      duplication: await this.checkDuplication(filePath),
      unusedFunctions: this.findUnusedFunctions(ast),
      unusedImports: this.findUnusedImports(ast),
      magicNumbers: this.countMagicNumbers(ast),
      coupling: this.calculateCoupling(ast),
      cohesion: this.calculateCohesion(ast)
    };
  }

  // Count numeric literals that are not in the conventional allow-list
  // (0, 1, -1, 100, 1000 are considered self-explanatory).
  private countMagicNumbers(ast: any): number {
    let count = 0;
    const allowedNumbers = new Set([0, 1, -1, 100, 1000]);
    for (const node of this.walkAST(ast)) {
      if (node.type === 'NumericLiteral') {
        if (!allowedNumbers.has(node.value)) {
          count++;
        }
      }
    }
    return count;
  }

  // Coupling = number of distinct imported modules.
  private calculateCoupling(ast: any): number {
    const dependencies = new Set<string>();
    for (const node of this.walkAST(ast)) {
      if (node.type === 'ImportDeclaration' || node.type === 'Import') {
        // NOTE(review): Babel-style ASTs keep the module string at
        // node.source.value, not node.source - confirm against the parser used.
        dependencies.add(node.source);
      }
    }
    return dependencies.size;
  }

  // Cohesion score in [0, 1]: the fraction of (method, attribute) pairs in
  // which the method actually uses the attribute; 1 means every method
  // touches every attribute. A class with no methods or no attributes is
  // treated as perfectly cohesive.
  // NOTE(review): despite the original "LCOM4" reference, this is a simple
  // access-ratio metric, not LCOM4 (which counts connected components).
  private calculateCohesion(ast: any): number {
    const methods = this.extractMethods(ast);
    const attributes = this.extractAttributes(ast);
    if (methods.length === 0 || attributes.length === 0) {
      return 1.0;
    }
    let totalAccess = 0;
    for (const method of methods) {
      for (const attr of attributes) {
        if (method.uses.includes(attr)) {
          totalAccess++;
        }
      }
    }
    const maxAccess = methods.length * attributes.length;
    return maxAccess > 0 ? totalAccess / maxAccess : 0;
  }
}
// Usage example: scan a project and print findings grouped by severity.
// NOTE(review): uses top-level await - requires an ES-module context.
const detector = new AntiPatternDetector();
const issues = await detector.detect('/path/to/project');
console.log(`Found ${issues.length} anti-patterns:\n`);
// Group by severity
const grouped = issues.reduce((acc, issue) => {
  if (!acc[issue.severity]) acc[issue.severity] = [];
  acc[issue.severity].push(issue);
  return acc;
}, {});
for (const [severity, items] of Object.entries(grouped)) {
  console.log(`\n${severity.toUpperCase()}: ${items.length} issues`);
  items.forEach(item => {
    console.log(`  ${item.pattern} in ${item.file}`);
    console.log(`    ${item.message}`);
    if (item.suggestion) {
      console.log(`    Suggestion: ${item.suggestion}`);
    }
  });
}
Similarity Search
# scripts/similarity_search.py
from typing import List, Tuple
import difflib
from dataclasses import dataclass
from pathlib import Path
@dataclass
class SimilarityMatch:
    """A pair of similar code segments found in two different files."""
    file1: str  # path of the first file
    line1: int  # 1-based start line of the segment in file1
    file2: str  # path of the second file
    line2: int  # 1-based start line of the segment in file2
    similarity: float  # difflib ratio in [0, 1]
    snippet1: str  # segment text from file1
    snippet2: str  # segment text from file2
class SimilaritySearch:
    """Find similar code segments across a project's Python files.

    Every pair of files is compared with a sliding window of ``min_lines``
    lines; window pairs whose difflib similarity ratio reaches ``threshold``
    are reported. The scan is quadratic in both file count and file length,
    so point it at targeted directories rather than huge trees.
    """

    def __init__(self, threshold: float = 0.80):
        """
        Args:
            threshold: Minimum similarity ratio (0..1) for a reported match.
        """
        self.threshold = threshold

    def find_similar(
        self,
        root_path: str,
        min_lines: int = 5
    ) -> List["SimilarityMatch"]:
        """Return all cross-file similar segments, most similar first."""
        files = sorted(Path(root_path).rglob('*.py'))  # sorted => deterministic output
        # Read every file exactly once up front. The original implementation
        # re-read both files for every pair, and a single unreadable/binary
        # file raised out of the whole scan.
        contents = {}
        for path in files:
            try:
                with open(path, 'r', encoding='utf-8', errors='ignore') as fh:
                    contents[path] = fh.readlines()
            except OSError:
                continue  # skip unreadable files instead of aborting
        readable = [p for p in files if p in contents]
        matches: List["SimilarityMatch"] = []
        # Compare every distinct pair of files (within-file duplication is
        # deliberately not checked, matching the original behavior).
        for i in range(len(readable)):
            for j in range(i + 1, len(readable)):
                matches.extend(self._compare_lines(
                    readable[i], contents[readable[i]],
                    readable[j], contents[readable[j]],
                    min_lines
                ))
        return sorted(matches, key=lambda m: m.similarity, reverse=True)

    def _compare_lines(
        self,
        file1: Path,
        lines1: List[str],
        file2: Path,
        lines2: List[str],
        min_lines: int
    ) -> List["SimilarityMatch"]:
        """All-pairs sliding-window comparison of two files' line lists."""
        matches: List["SimilarityMatch"] = []
        for i in range(len(lines1) - min_lines + 1):
            segment1 = lines1[i:i + min_lines]
            for j in range(len(lines2) - min_lines + 1):
                segment2 = lines2[j:j + min_lines]
                similarity = self._calculate_similarity(segment1, segment2)
                if similarity >= self.threshold:
                    matches.append(SimilarityMatch(
                        file1=str(file1),
                        line1=i + 1,
                        file2=str(file2),
                        line2=j + 1,
                        similarity=similarity,
                        snippet1=''.join(segment1),
                        snippet2=''.join(segment2)
                    ))
        return matches

    def _calculate_similarity(
        self,
        lines1: List[str],
        lines2: List[str]
    ) -> float:
        """Similarity ratio in [0, 1] between two line lists (SequenceMatcher)."""
        matcher = difflib.SequenceMatcher(None, lines1, lines2)
        return matcher.ratio()
# Usage example: scan a project and print the ten most similar segment pairs.
# NOTE: runs at import time; replace '/path/to/project' with a real root.
searcher = SimilaritySearch(threshold=0.80)
similar = searcher.find_similar('/path/to/project', min_lines=5)
print(f"Found {len(similar)} similar code segments:\n")
for match in similar[:10]:
    print(f"Similarity: {match.similarity:.2%}")
    print(f"  {match.file1}:{match.line1}")
    print(f"  {match.file2}:{match.line2}")
    print()
Usage Examples
Detect Code Clones
Apply pattern-finding skill to detect Type 1, 2, and 3 code clones with minimum 6 lines
Find Anti-Patterns
Apply pattern-finding skill to detect God Objects, Long Methods, and Tight Coupling issues
Similarity Search
Apply pattern-finding skill to find similar code segments across the codebase with 80% similarity threshold
Refactoring Opportunities
Apply pattern-finding skill to identify Extract Method and Extract Class refactoring opportunities
Integration Points
- codebase-analysis-patterns - Architecture and metrics analysis
- codebase-navigation - File discovery and structure understanding
- code-review-patterns - Review workflow and quality gates
Success Output
When successful, this skill MUST output:
✅ SKILL COMPLETE: pattern-finding
Completed:
- [x] Clone detection executed (Type 1, 2, 3 clones analyzed)
- [x] Anti-pattern detection completed
- [x] Similarity search performed
- [x] Refactoring opportunities identified
Outputs:
- Clone detection results with locations and similarity scores
- Anti-pattern report grouped by severity (critical/high/medium/low)
- Similar code segments ranked by similarity
- Refactoring recommendations with fix suggestions
Statistics:
- Code clones found: {count}
- Anti-patterns detected: {count}
- Similar segments: {count}
- Average similarity: {percentage}
Completion Checklist
Before marking this skill as complete, verify:
- Clone detection ran successfully for all code types (Type 1, 2, 3)
- Anti-pattern detector analyzed all target files
- Similarity search completed with threshold applied
- Results include file paths, line numbers, and severity classifications
- Recommendations include specific fix suggestions
- Output includes statistical summary
- No analysis errors or incomplete scans
Failure Indicators
This skill has FAILED if:
- ❌ AST parsing failed for target files
- ❌ Clone detection returned no results with no explanation
- ❌ Anti-pattern metrics calculation errored
- ❌ Similarity threshold resulted in zero matches without justification
- ❌ Output missing severity classifications
- ❌ No actionable recommendations provided
- ❌ Script execution timeout or memory errors
When NOT to Use
Do NOT use this skill when:
- Only need simple text search (use the `Grep` tool instead)
- Looking for exact duplicates only (use `fdupes` or `rdfind` instead)
- Need real-time code review during development (use the `code-review-patterns` skill)
- Target is non-code files (logs, configs) - not applicable
- Codebase is too large for full scan (>100K files) - use targeted analysis
- Need architecture-level insights (use the `codebase-analysis-patterns` skill)
- Just need to count lines of code (use the `cloc` tool)
Use alternative skills:
- codebase-analysis-patterns - For architecture and dependency analysis
- code-review-patterns - For PR review and quality gates
- refactoring-patterns - For specific refactoring transformations
Anti-Patterns (Avoid)
| Anti-Pattern | Problem | Solution |
|---|---|---|
| Running on entire codebase without filtering | Excessive runtime, too many results | Target specific directories or file types first |
| Ignoring clone context | False positives (boilerplate, generated code) | Exclude test fixtures, migrations, auto-generated files |
| Low similarity threshold (<0.7) | Too many irrelevant matches | Use 0.80-0.95 threshold, tune based on results |
| Not grouping results by severity | Overwhelming output, unclear priorities | Always group anti-patterns by severity (critical → low) |
| Running without output limits | Memory exhaustion on large codebases | Set max results or use pagination |
| Skipping duplication percentage check | Miss high-duplication areas | Calculate duplication percentage before refactoring |
| Applying all fixes blindly | Breaking working code | Review each recommendation, test before applying |
Principles
This skill embodies:
- #1 Recycle → Extend → Re-Use → Create - Find existing patterns to avoid duplication
- #5 Eliminate Ambiguity - Explicit similarity thresholds and severity classifications
- #6 Clear, Understandable, Explainable - Concrete examples with file locations and line numbers
- #8 No Assumptions - Verify clone types and thresholds with statistical validation
- #11 Don't Repeat Yourself (DRY) - Detect and eliminate code duplication systematically
Full Principles: CODITECT-STANDARD-AUTOMATION.md