Skip to main content

scripts-component-classifier

#!/usr/bin/env python3
"""

title: Component Classifier
component_type: script
version: 1.0.0
audience: contributor
status: active
summary: Shared context-aware component classification for CODITECT
keywords:
  - classification
  - components
  - skills
  - agents
  - commands
created: 2026-01-13
updated: 2026-01-13

Shared Component Classifier for CODITECT

Provides context-aware classification of component invocations (skills, agents, commands, hooks) used by:

  • session-retrospective.py (skill learning system)
  • unified-message-extractor.py (message storage system)
  • context-db.py (database indexing)

The classifier uses multiple signals:

  1. Pattern hints from regex matches (highest confidence)
  2. Surrounding context analysis
  3. Filesystem lookup (fallback)

Usage:
    from component_classifier import ComponentClassifier

    classifier = ComponentClassifier(coditect_root)
    comp_type = classifier.classify("orient", context_text)
    # Returns: 'command'

    invocations = classifier.extract_invocations(transcript_text)
    # Returns: [{'name': 'orient', 'type': 'command', 'context': '...'}, ...]

"""

import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

class ComponentClassifier:
    """Context-aware component classifier for the CODITECT framework.

    On construction the classifier scans the ``skills``, ``agents``,
    ``commands`` and ``hooks`` directories below the given root and keeps the
    lower-cased names it finds. A name is then classified from three signals,
    in decreasing priority: the type hint of the regex that matched it, the
    surrounding text, and finally a plain filesystem lookup.
    """

    def __init__(self, coditect_root: Path):
        """
        Initialize the classifier with component directories.

        Args:
            coditect_root: Path to coditect-core root directory
        """
        self.root = Path(coditect_root)
        self.skills_dir = self.root / "skills"
        self.agents_dir = self.root / "agents"
        self.commands_dir = self.root / "commands"
        self.hooks_dir = self.root / "hooks"

        # Load actual component names from filesystem
        self._actual_skills = self._load_component_names(self.skills_dir, is_skill=True)
        self._actual_agents = self._load_component_names(self.agents_dir)
        self._actual_commands = self._load_component_names(self.commands_dir)
        self._actual_hooks = self._load_component_names(self.hooks_dir, is_hook=True)

        # Initialize pattern regexes
        self._init_patterns()

    def _load_component_names(self, directory: Path, is_skill: bool = False,
                              is_hook: bool = False) -> Set[str]:
        """Load lower-cased component names from one component directory.

        Args:
            directory: Directory to scan.
            is_skill: Treat every sub-directory containing ``SKILL.md`` as a
                component (its directory name is the component name).
            is_hook: Treat every ``*.py`` file as a component. When neither
                flag is set, every ``*.md`` file is a component.

        Returns:
            Set of lower-cased names; empty when ``directory`` is missing or
            is not a directory.
        """
        # is_dir() rather than exists(): a regular file at this path would
        # otherwise make iterdir() raise NotADirectoryError.
        if not directory.is_dir():
            return set()

        if is_skill:
            # Skills are directories containing SKILL.md
            return {
                item.name.lower()
                for item in directory.iterdir()
                if item.is_dir() and (item / "SKILL.md").exists()
            }

        # Hooks are .py files; agents and commands are .md files.
        glob_pattern = "*.py" if is_hook else "*.md"
        return {item.stem.lower() for item in directory.glob(glob_pattern)}

    def _init_patterns(self):
        """Initialize the invocation regexes and the ignore list."""
        # Each tuple: (regex_pattern, type_hint)
        # type_hint: 'command', 'skill', 'agent', 'hook', or None (ambiguous)
        # Order matters: extract_invocations() keeps only the first match of
        # a name, so high-confidence patterns come before ambiguous ones.
        self.invocation_patterns: List[Tuple[str, Optional[str]]] = [
            # COMMAND patterns (high confidence)
            (r'<command-name>/?(\w+[-\w]*)', 'command'),
            (r'running\s+/(\w+[-\w]*)', 'command'),
            (r'execute\s+/(\w+[-\w]*)', 'command'),
            (r'invoke\s+/(\w+[-\w]*)', 'command'),

            # AGENT patterns (high confidence)
            (r'/agent\s+(\w+[-\w]*)', 'agent'),
            (r'subagent_type["\']?\s*[:=]\s*["\']?(\w+[-\w]*)', 'agent'),
            (r'Task\s*\([^)]*subagent_type[^)]*["\'](\w+[-\w]*)["\']', 'agent'),
            (r'launching\s+agent[:\s]+(\w+[-\w]*)', 'agent'),

            # SKILL patterns (high confidence)
            (r'Use\s+(?:the\s+)?(\w+[-\w]*)\s+skill', 'skill'),  # "Use X skill" or "Use the X skill"
            (r'skill:\s*["\']?(\w+[-\w]*)["\']?', 'skill'),
            (r'TOOL_USE:\s*Skill.*?skill.*?["\'](\w+[-\w]*)["\']', 'skill'),
            (r'Launching\s+skill:\s*(\w+[-\w]*)', 'skill'),
            (r'skills/(\w+[-\w]*)/', 'skill'),

            # HOOK patterns (high confidence)
            (r'hooks/(\w+[-\w]*)\.py', 'hook'),
            (r'PostToolUse[:\s]+(\w+[-\w]*)', 'hook'),
            (r'PreToolUse[:\s]+(\w+[-\w]*)', 'hook'),

            # AMBIGUOUS patterns (need context analysis)
            (r'/(\w+[-\w]*)', None),
            (r'invoking\s+(\w+[-\w]*)', None),
        ]

        # Components to ignore (common words, paths, etc.)
        self.ignore_names: Set[str] = {
            # Common words
            'the', 'a', 'an', 'to', 'for', 'with', 'and', 'or', 'in', 'on',
            # Meta-keywords (often matched but not actual components)
            'agent', 'skill', 'command', 'hook', 'task', 'tool',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
            'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
            'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used',
            'true', 'false', 'null', 'none', 'undefined', 'this', 'that',
            # Path components
            'users', 'halcasteel', 'projects', 'coditect-rollout-master',
            'submodules', 'core', 'coditect-core', 'docs', 'internal',
            'scripts', 'hooks', 'agents', 'commands', 'skills', 'config',
            'context-storage', 'reference', 'reports', 'archive', 'tools',
            'usr', 'env', 'home', 'opt', 'var', 'tmp', 'etc',
            # File extensions
            'md', 'py', 'json', 'yaml', 'yml', 'txt', 'sh', 'js', 'ts',
            # Numbers and single chars
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
        }

    def _names_by_type(self) -> Dict[str, Set[str]]:
        """Map each component type to the names found on disk.

        The key order (skill, agent, command, hook) is the precedence used by
        the filesystem fallback when a name exists under several types.
        """
        return {
            'skill': self._actual_skills,
            'agent': self._actual_agents,
            'command': self._actual_commands,
            'hook': self._actual_hooks,
        }

    def classify_from_filesystem(self, name: str) -> str:
        """
        Classify a component name using filesystem lookup only.

        Args:
            name: Component name to classify (case-insensitive)

        Returns:
            Component type: 'skill', 'agent', 'command', 'hook', or 'unknown'.
            A name present under several types resolves in that order.
        """
        name_lower = name.lower()
        for comp_type, names in self._names_by_type().items():
            if name_lower in names:
                return comp_type
        return 'unknown'

    def _score_context(self, name_lower: str, context: str) -> Dict[str, int]:
        """Count, per component type, how many context patterns mention the name.

        Args:
            name_lower: Lower-cased component name.
            context: Text surrounding the candidate invocation.

        Returns:
            Dict keyed 'agent', 'command', 'skill', 'hook' (in that order)
            with the number of matching patterns for each type.
        """
        target = re.escape(name_lower)
        patterns_by_type = {
            # Agent invocation patterns
            'agent': [
                rf'/agent\s+{target}',
                rf'agent["\s:]+{target}',
                rf'subagent_type["\s:=]+{target}',
                rf'Task\s*\([^)]*{target}',
                rf'launching\s+agent[:\s]+{target}',
            ],
            # Command invocation patterns (avoid matching file paths)
            'command': [
                rf'(?:^|\s)/{target}\b',
                rf'<command-name>/?{target}',
                rf'running\s+/{target}',
                rf'execute\s+/{target}',
                rf'commands/{target}\.md',
            ],
            # Skill invocation patterns
            'skill': [
                rf'skill["\s:]+{target}',
                rf'use\s+{target}\s+skill',
                rf'TOOL_USE:\s*Skill[^}}]*{target}',
                rf'skills/{target}/',
            ],
            # Hook invocation patterns
            'hook': [
                rf'hooks/{target}\.py',
                rf'hook[:\s]+{target}',
                rf'PostToolUse[:\s]+{target}',
                rf'PreToolUse[:\s]+{target}',
            ],
        }
        # Match case-insensitively. Several patterns contain upper-case
        # literals (Task, TOOL_USE, PostToolUse, PreToolUse) that can never
        # match a lower-cased context under a case-sensitive search.
        return {
            comp_type: sum(
                1 for pattern in patterns
                if re.search(pattern, context, re.IGNORECASE)
            )
            for comp_type, patterns in patterns_by_type.items()
        }

    def classify(self, name: str, context: str = "",
                 pattern_hint: Optional[str] = None) -> str:
        """
        Context-aware component classification.

        Uses multiple signals to determine component type:
        1. Pattern hint from the regex that matched (highest confidence)
        2. Surrounding context clues
        3. Filesystem lookup (fallback)

        Args:
            name: Component name to classify
            context: Surrounding text from transcript
            pattern_hint: Optional hint from the matching pattern

        Returns:
            Component type: 'skill', 'agent', 'command', 'hook', or 'unknown'
        """
        name_lower = name.lower()

        # Skip ignored names
        if name_lower in self.ignore_names:
            return 'unknown'

        known = self._names_by_type()

        # 1. PATTERN HINT (highest confidence)
        if pattern_hint:
            # Hint confirmed by the filesystem.
            if name_lower in known.get(pattern_hint, ()):
                return pattern_hint
            # Hint not confirmed: the name may exist under another type.
            fs_type = self.classify_from_filesystem(name_lower)
            if fs_type != 'unknown':
                return fs_type
            # Trust a valid hint even when the name is not on disk.
            if pattern_hint in known:
                return pattern_hint

        # 2. CONTEXT-BASED CLASSIFICATION
        if context:
            scores = self._score_context(name_lower, context)
            max_score = max(scores.values())
            if max_score > 0:
                leaders = [t for t, score in scores.items() if score == max_score]
                # Prefer a top-scoring type that the filesystem confirms, so a
                # tie is not decided by dictionary order alone.
                for comp_type in leaders:
                    if name_lower in known[comp_type]:
                        return comp_type
                # Trust unverified context only with at least two signals.
                if max_score >= 2:
                    return leaders[0]

        # 3. FILESYSTEM LOOKUP (fallback)
        return self.classify_from_filesystem(name_lower)

    def extract_invocations(self, text: str, window_size: int = 200) -> List[Dict]:
        """
        Extract component invocations from text with classification.

        Patterns are tried in the order of ``self.invocation_patterns`` and
        each name is reported once, for the first pattern that finds it, so
        results are ordered by pattern rather than by position in ``text``.

        Args:
            text: Text to search for invocations
            window_size: Characters of context to include around each match

        Returns:
            List of dicts with 'name', 'type', 'context', 'pattern_hint'
            and 'match_position'
        """
        invocations = []
        seen = set()  # Deduplicate on lower-cased name

        for pattern, type_hint in self.invocation_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                name = match.group(1)
                name_lower = name.lower()

                # Skip ignored names and names already reported
                if name_lower in self.ignore_names or name_lower in seen:
                    continue
                seen.add(name_lower)

                # Get context window
                start = max(0, match.start() - window_size)
                end = min(len(text), match.end() + window_size)
                context = text[start:end]

                # Classify with context
                comp_type = self.classify(name, context, type_hint)

                # Keep unclassified names only when they look like plausible
                # component names.
                if comp_type != 'unknown' or self._is_valid_component_name(name_lower):
                    invocations.append({
                        'name': name_lower,
                        'type': comp_type,
                        'context': context,
                        'pattern_hint': type_hint,
                        'match_position': match.start(),
                    })

        return invocations

    def _is_valid_component_name(self, name: str) -> bool:
        """Check if a name could be a valid component name.

        A candidate needs at least two characters and at least one letter,
        which also rules out purely numeric names.
        """
        return len(name) >= 2 and any(c.isalpha() for c in name)

    def get_component_counts(self) -> Dict[str, int]:
        """Get counts of components by type from filesystem."""
        return {
            'skills': len(self._actual_skills),
            'agents': len(self._actual_agents),
            'commands': len(self._actual_commands),
            'hooks': len(self._actual_hooks),
        }

    def is_known_component(self, name: str) -> bool:
        """Check if a name is a known component in any category."""
        return self.classify_from_filesystem(name) != 'unknown'

# Singleton instance for convenience

_classifier_instance: Optional[ComponentClassifier] = None


def get_classifier(coditect_root: Optional[Path] = None) -> ComponentClassifier:
    """
    Get or create the singleton classifier instance.

    The instance is built once, on the first call. Later calls return that
    same instance and ignore ``coditect_root``, even if a different path is
    passed.

    Args:
        coditect_root: Path to coditect-core root. Only consulted on the
            first call; when omitted there, the root is derived from this
            file's location.

    Returns:
        ComponentClassifier instance
    """
    global _classifier_instance

    if _classifier_instance is None:
        if coditect_root is None:
            # Fall back to the directory three levels above this file
            # (presumably the coditect-core root; confirm against the
            # repository layout).
            coditect_root = Path(__file__).parent.parent.parent
        _classifier_instance = ComponentClassifier(coditect_root)

    return _classifier_instance

if __name__ == "__main__":
    # Manual smoke test: classify a few known names and run one extraction
    # against the components found three directory levels above this file.
    root = Path(__file__).parent.parent.parent
    classifier = ComponentClassifier(root)

    print("Component Classifier Test")
    print("=" * 60)
    print(f"Components loaded: {classifier.get_component_counts()}")
    print()

    # Test cases: (name, context, expected type)
    test_cases = [
        ("orient", "<command-name>orient", "command"),
        ("senior-architect", "/agent senior-architect", "agent"),
        ("git-sync", "Use git-sync skill", "skill"),
        ("session-retrospective", "hooks/session-retrospective.py", "hook"),
        ("classify", "running /classify", "command"),
        ("unknown-thing", "some random text", "unknown"),
    ]

    print("Classification Tests:")
    for name, context, expected in test_cases:
        result = classifier.classify(name, context)
        status = "PASS" if result == expected else "FAIL"
        print(f" [{status}] {name}: expected={expected}, got={result}")

    print()
    print("Extraction Test:")
    sample_text = """
    Running /orient to start the session.
    Then /agent senior-architect "review the code".
    Using git-sync skill for repository sync.
    The hook hooks/session-retrospective.py was triggered.
    """

    invocations = classifier.extract_invocations(sample_text)
    for inv in invocations:
        print(f" - {inv['name']}: {inv['type']}")