#!/usr/bin/env python3
# scripts-component-classifier
"""
title: Component Classifier
component_type: script
version: 1.0.0
audience: contributor
status: active
summary: Shared context-aware component classification for CODITECT
keywords:
  - classification
  - components
  - skills
  - agents
  - commands
created: 2026-01-13
updated: 2026-01-13

Shared Component Classifier for CODITECT

Provides context-aware classification of component invocations
(skills, agents, commands, hooks) used by:

- session-retrospective.py (skill learning system)
- unified-message-extractor.py (message storage system)
- context-db.py (database indexing)

The classifier uses multiple signals:

1. Pattern hints from regex matches (highest confidence)
2. Surrounding context analysis
3. Filesystem lookup (fallback)

Usage:
    from component_classifier import ComponentClassifier

    classifier = ComponentClassifier(coditect_root)
    comp_type = classifier.classify("orient", context_text)
    # Returns: 'command'

    invocations = classifier.extract_invocations(transcript_text)
    # Returns: [{'name': 'orient', 'type': 'command', 'context': '...'}, ...]

"""

import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

class ComponentClassifier:
    """Context-aware component classifier for CODITECT framework.

    Classifies a component name as 'skill', 'agent', 'command', 'hook' or
    'unknown'. Signals are consulted in priority order: a hint attached to
    the regex that matched, clues in the surrounding text, and finally the
    component names discovered under the root directory at construction time.
    """

    # Lookup order for classify_from_filesystem(); it decides which type wins
    # when the same name exists in more than one category.
    _COMPONENT_TYPES = ('skill', 'agent', 'command', 'hook')

    def __init__(self, coditect_root: Path):
        """
        Initialize the classifier with component directories.

        Args:
            coditect_root: Path to coditect-core root directory
        """
        self.root = Path(coditect_root)
        self.skills_dir = self.root / "skills"
        self.agents_dir = self.root / "agents"
        self.commands_dir = self.root / "commands"
        self.hooks_dir = self.root / "hooks"

        # Load actual component names from filesystem (a snapshot taken once;
        # later changes on disk are not picked up by this instance).
        self._actual_skills = self._load_component_names(self.skills_dir, is_skill=True)
        self._actual_agents = self._load_component_names(self.agents_dir)
        self._actual_commands = self._load_component_names(self.commands_dir)
        self._actual_hooks = self._load_component_names(self.hooks_dir, is_hook=True)

        # Initialize pattern regexes
        self._init_patterns()

    def _load_component_names(self, directory: Path, is_skill: bool = False,
                              is_hook: bool = False) -> Set[str]:
        """Load lower-cased component names from one component directory.

        Args:
            directory: Directory to scan.
            is_skill: Treat each sub-directory containing SKILL.md as a skill.
            is_hook: Treat each ``*.py`` file as a hook (ignored if is_skill).

        Returns:
            Set of lower-cased names; empty if ``directory`` is not a directory.
        """
        # is_dir() rather than exists(): a regular file at this path would
        # otherwise make iterdir() raise NotADirectoryError.
        if not directory.is_dir():
            return set()

        if is_skill:
            # Skills are directories containing SKILL.md
            return {
                item.name.lower()
                for item in directory.iterdir()
                if item.is_dir() and (item / "SKILL.md").exists()
            }

        # Hooks are .py files; agents and commands are .md files.
        glob_pattern = "*.py" if is_hook else "*.md"
        return {item.stem.lower() for item in directory.glob(glob_pattern)}

    def _init_patterns(self):
        """Initialize pattern detection regexes and the ignore list."""
        # Pattern detection regexes with type hints
        # Each tuple: (regex_pattern, type_hint)
        # type_hint: 'command', 'skill', 'agent', 'hook', or None (ambiguous)
        # Order matters: extract_invocations() keeps only the first match per
        # name, so high-confidence patterns must precede the ambiguous ones.
        self.invocation_patterns = [
            # COMMAND patterns (high confidence)
            (r'<command-name>/?(\w+[-\w]*)', 'command'),
            (r'running\s+/(\w+[-\w]*)', 'command'),
            (r'execute\s+/(\w+[-\w]*)', 'command'),
            (r'invoke\s+/(\w+[-\w]*)', 'command'),

            # AGENT patterns (high confidence)
            (r'/agent\s+(\w+[-\w]*)', 'agent'),
            (r'subagent_type["\']?\s*[:=]\s*["\']?(\w+[-\w]*)', 'agent'),
            (r'Task\s*\([^)]*subagent_type[^)]*["\'](\w+[-\w]*)["\']', 'agent'),
            (r'launching\s+agent[:\s]+(\w+[-\w]*)', 'agent'),

            # SKILL patterns (high confidence)
            (r'Use\s+(?:the\s+)?(\w+[-\w]*)\s+skill', 'skill'),  # "Use X skill" or "Use the X skill"
            (r'skill:\s*["\']?(\w+[-\w]*)["\']?', 'skill'),
            (r'TOOL_USE:\s*Skill.*?skill.*?["\'](\w+[-\w]*)["\']', 'skill'),
            (r'Launching\s+skill:\s*(\w+[-\w]*)', 'skill'),
            (r'skills/(\w+[-\w]*)/', 'skill'),

            # HOOK patterns (high confidence)
            (r'hooks/(\w+[-\w]*)\.py', 'hook'),
            (r'PostToolUse[:\s]+(\w+[-\w]*)', 'hook'),
            (r'PreToolUse[:\s]+(\w+[-\w]*)', 'hook'),

            # AMBIGUOUS patterns (need context analysis)
            (r'/(\w+[-\w]*)', None),
            (r'invoking\s+(\w+[-\w]*)', None),
        ]

        # Components to ignore (common words, paths, etc.)
        self.ignore_names = {
            # Common words
            'the', 'a', 'an', 'to', 'for', 'with', 'and', 'or', 'in', 'on',
            # Meta-keywords (often matched but not actual components)
            'agent', 'skill', 'command', 'hook', 'task', 'tool',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
            'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
            'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used',
            'true', 'false', 'null', 'none', 'undefined', 'this', 'that',
            # Path components
            'users', 'halcasteel', 'projects', 'coditect-rollout-master',
            'submodules', 'core', 'coditect-core', 'docs', 'internal',
            'scripts', 'hooks', 'agents', 'commands', 'skills', 'config',
            'context-storage', 'reference', 'reports', 'archive', 'tools',
            'usr', 'env', 'home', 'opt', 'var', 'tmp', 'etc',
            # File extensions
            'md', 'py', 'json', 'yaml', 'yml', 'txt', 'sh', 'js', 'ts',
            # Numbers and single chars
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
        }

    def _names_for(self, comp_type: Optional[str]) -> Set[str]:
        """Return the set of known names for ``comp_type`` (empty if unrecognised)."""
        return {
            'skill': self._actual_skills,
            'agent': self._actual_agents,
            'command': self._actual_commands,
            'hook': self._actual_hooks,
        }.get(comp_type, set())

    def classify_from_filesystem(self, name: str) -> str:
        """
        Classify a component name using filesystem lookup only.

        When a name exists in several categories the first hit in the order
        skill, agent, command, hook wins.

        Args:
            name: Component name to classify

        Returns:
            Component type: 'skill', 'agent', 'command', 'hook', or 'unknown'
        """
        name_lower = name.lower()
        for comp_type in self._COMPONENT_TYPES:
            if name_lower in self._names_for(comp_type):
                return comp_type
        return 'unknown'

    def _context_scores(self, name_lower: str, context: str) -> Dict[str, int]:
        """Count, per component type, how many context patterns mention the name.

        The returned dict is ordered agent, command, skill, hook; classify()
        relies on that order to break ties.
        """
        esc = re.escape(name_lower)

        patterns_by_type = {
            # Agent invocation patterns
            'agent': [
                rf'/agent\s+{esc}',
                rf'agent["\s:]+{esc}',
                rf'subagent_type["\s:=]+{esc}',
                rf'Task\s*\([^)]*{esc}',
                rf'launching\s+agent[:\s]+{esc}',
            ],
            # Command invocation patterns (avoid matching file paths)
            'command': [
                rf'(?:^|\s)/{esc}\b',
                rf'<command-name>/?{esc}',
                rf'running\s+/{esc}',
                rf'execute\s+/{esc}',
                rf'commands/{esc}\.md',
            ],
            # Skill invocation patterns ("use X skill" / "use the X skill",
            # matching what extract_invocations() accepts)
            'skill': [
                rf'skill["\s:]+{esc}',
                rf'use\s+(?:the\s+)?{esc}\s+skill',
                rf'TOOL_USE:\s*Skill[^}}]*{esc}',
                rf'skills/{esc}/',
            ],
            # Hook invocation patterns
            'hook': [
                rf'hooks/{esc}\.py',
                rf'hook[:\s]+{esc}',
                rf'PostToolUse[:\s]+{esc}',
                rf'PreToolUse[:\s]+{esc}',
            ],
        }

        # Match case-insensitively against the original text. Lower-casing the
        # text while keeping capitals in the patterns (Task, TOOL_USE,
        # PostToolUse, PreToolUse) would make those patterns unmatchable.
        return {
            comp_type: sum(
                1 for pattern in patterns
                if re.search(pattern, context, re.IGNORECASE)
            )
            for comp_type, patterns in patterns_by_type.items()
        }

    def classify(self, name: str, context: str = "",
                 pattern_hint: Optional[str] = None) -> str:
        """
        Context-aware component classification.

        Uses multiple signals to determine component type:
        1. Pattern hint from the regex that matched (highest confidence)
        2. Surrounding context clues
        3. Filesystem lookup (fallback)

        Args:
            name: Component name to classify
            context: Surrounding text from transcript
            pattern_hint: Optional hint from the matching pattern

        Returns:
            Component type: 'skill', 'agent', 'command', 'hook', or 'unknown'
        """
        name_lower = name.lower()

        # Skip ignored names
        if name_lower in self.ignore_names:
            return 'unknown'

        # 1. PATTERN HINT (highest confidence)
        if pattern_hint:
            # Hint confirmed by the filesystem
            if name_lower in self._names_for(pattern_hint):
                return pattern_hint
            # Hint not confirmed - the name may exist under another type
            fs_type = self.classify_from_filesystem(name_lower)
            if fs_type != 'unknown':
                return fs_type
            # Trust the pattern hint even if not in filesystem
            if pattern_hint in self._COMPONENT_TYPES:
                return pattern_hint

        # 2. CONTEXT-BASED CLASSIFICATION
        if context:
            scores = self._context_scores(name_lower, context)
            max_score = max(scores.values())
            if max_score > 0:
                leaders = [t for t, score in scores.items() if score == max_score]
                # Prefer any top-scoring type the filesystem confirms ...
                for comp_type in leaders:
                    if name_lower in self._names_for(comp_type):
                        return comp_type
                # ... otherwise trust the context only when at least two
                # independent patterns agree.
                if max_score >= 2:
                    return leaders[0]

        # 3. FILESYSTEM LOOKUP (fallback)
        return self.classify_from_filesystem(name_lower)

    def extract_invocations(self, text: str, window_size: int = 200) -> List[Dict]:
        """
        Extract component invocations from text with classification.

        Each distinct name is reported once, for the first pattern (in
        ``invocation_patterns`` order) that matches it, so results are grouped
        by pattern rather than sorted by position in the text.

        Args:
            text: Text to search for invocations
            window_size: Characters of context to include around each match

        Returns:
            List of dicts with 'name', 'type', 'context', 'pattern_hint'
            and 'match_position'
        """
        invocations: List[Dict] = []
        seen: Set[str] = set()  # Deduplicate by lower-cased name

        for pattern, type_hint in self.invocation_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                name_lower = match.group(1).lower()

                # Skip ignored names and names already reported
                if name_lower in self.ignore_names or name_lower in seen:
                    continue
                seen.add(name_lower)

                # Context window around the match, clamped to the text bounds
                start = max(0, match.start() - window_size)
                end = min(len(text), match.end() + window_size)
                context = text[start:end]

                comp_type = self.classify(name_lower, context, type_hint)

                # Keep unknown names only if they could plausibly be a component
                if comp_type == 'unknown' and not self._is_valid_component_name(name_lower):
                    continue

                invocations.append({
                    'name': name_lower,
                    'type': comp_type,
                    'context': context,
                    'pattern_hint': type_hint,
                    'match_position': match.start(),
                })

        return invocations

    def _is_valid_component_name(self, name: str) -> bool:
        """Check if a name could be a valid component name.

        A candidate needs at least two characters and at least one letter
        (which also rules out purely numeric names).
        """
        return len(name) >= 2 and any(c.isalpha() for c in name)

    def get_component_counts(self) -> Dict[str, int]:
        """Get counts of components by type from filesystem."""
        return {
            'skills': len(self._actual_skills),
            'agents': len(self._actual_agents),
            'commands': len(self._actual_commands),
            'hooks': len(self._actual_hooks),
        }

    def is_known_component(self, name: str) -> bool:
        """Check if a name is a known component in any category."""
        return self.classify_from_filesystem(name) != 'unknown'

# Singleton instance for convenience
_classifier_instance: Optional[ComponentClassifier] = None


def get_classifier(coditect_root: Optional[Path] = None) -> ComponentClassifier:
    """
    Get or create the singleton classifier instance.

    The instance is built on the first call and reused afterwards, so
    ``coditect_root`` only has an effect on that first call; on later calls
    it is ignored and the existing instance is returned.

    Args:
        coditect_root: Path to coditect-core root. If omitted on the first
            call, the directory three levels above this file is used.

    Returns:
        ComponentClassifier instance
    """
    global _classifier_instance

    if _classifier_instance is None:
        if coditect_root is None:
            # Try to find coditect root
            coditect_root = Path(__file__).parent.parent.parent
        _classifier_instance = ComponentClassifier(coditect_root)

    return _classifier_instance

if __name__ == "__main__":
    # Test the classifier: a manual smoke test run against the component
    # directories found three levels above this file. Results therefore
    # depend on which components exist in that checkout.
    root = Path(__file__).parent.parent.parent
    classifier = ComponentClassifier(root)

    print("Component Classifier Test")
    print("=" * 60)
    print(f"Components loaded: {classifier.get_component_counts()}")
    print()

    # Test cases: (name, context, expected type)
    test_cases = [
        ("orient", "<command-name>orient", "command"),
        ("senior-architect", "/agent senior-architect", "agent"),
        ("git-sync", "Use git-sync skill", "skill"),
        ("session-retrospective", "hooks/session-retrospective.py", "hook"),
        ("classify", "running /classify", "command"),
        ("unknown-thing", "some random text", "unknown"),
    ]

    print("Classification Tests:")
    for name, context, expected in test_cases:
        result = classifier.classify(name, context)
        status = "PASS" if result == expected else "FAIL"
        print(f"  [{status}] {name}: expected={expected}, got={result}")

    print()
    print("Extraction Test:")
    sample_text = """
Running /orient to start the session.
Then /agent senior-architect "review the code".
Using git-sync skill for repository sync.
The hook hooks/session-retrospective.py was triggered.
"""

    invocations = classifier.extract_invocations(sample_text)
    for inv in invocations:
        print(f"  - {inv['name']}: {inv['type']}")

# End of component classifier module.