scripts-extract-framework-metadata

#!/usr/bin/env python3 """

title: "Extract Framework Metadata" component_type: script version: "1.0.0" audience: contributor status: stable summary: "Framework Metadata Extraction Script" keywords: ['analysis', 'automation', 'deployment', 'extract', 'framework'] tokens: ~500 created: 2025-12-22 updated: 2025-12-22 script_name: "extract-framework-metadata.py" language: python executable: true usage: "python3 scripts/extract-framework-metadata.py [options]" python_version: "3.10+" dependencies: [] modifies_files: false network_access: false requires_auth: false

Framework Metadata Extraction Script

Extracts metadata from all CODITECT components (agents, skills, commands, scripts) and generates the framework-registry.json file for LLM framework awareness.

Phase 2C: Framework Knowledge Registration System

Usage: python3 scripts/extract-framework-metadata.py

Output: - .coditect/config/framework-registry.json - .coditect/config/agents/ (individual agent metadata files) - .coditect/config/skills/ (individual skill metadata files) - .coditect/config/commands/ (individual command metadata files) - .coditect/config/scripts/ (individual script metadata files)

import argparse import json import re from pathlib import Path from typing import Dict, List, Optional, Any from datetime import datetime

def parse_args(): """Parse command line arguments""" parser = argparse.ArgumentParser( description='Extract metadata from all CODITECT components and generate framework-registry.json.', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=''' Examples: %(prog)s # Extract all component metadata %(prog)s --dry-run # Preview without writing files %(prog)s --verbose # Show detailed extraction info %(prog)s --agents-only # Only extract agent metadata %(prog)s --json # Output summary as JSON

Output:

.coditect/config/framework-registry.json
.coditect/config/agents/ (individual agent metadata files)
.coditect/config/skills/ (individual skill metadata files)
.coditect/config/commands/ (individual command metadata files)
.coditect/config/scripts/ (individual script metadata files)

Part of CODITECT Phase 2C: Framework Knowledge Registration System ''' ) parser.add_argument('--dry-run', action='store_true', help='Preview extraction without writing files') parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output') parser.add_argument('--agents-only', action='store_true', help='Only extract agent metadata') parser.add_argument('--skills-only', action='store_true', help='Only extract skill metadata') parser.add_argument('--commands-only', action='store_true', help='Only extract command metadata') parser.add_argument('--scripts-only', action='store_true', help='Only extract script metadata') parser.add_argument('--json', action='store_true', help='Output summary as JSON') return parser.parse_args()

def parse_simple_yaml(yaml_str: str) -> Dict[str, Any]: """Simple YAML parser for basic key-value pairs and lists.""" result = {} current_key = None current_list = []

for line in yaml_str.split('\n'):
    line = line.rstrip()

    # Skip empty lines and comments
    if not line or line.startswith('#'):
        continue

    # List item
    if line.strip().startswith('-') and current_key:
        item = line.strip()[1:].strip()
        current_list.append(item)
        continue

    # Key-value pair
    if ':' in line and not line.startswith(' '):
        # Save previous list if exists
        if current_key and current_list:
            result[current_key] = current_list
            current_list = []

        key, value = line.split(':', 1)
        key = key.strip()
        value = value.strip()

        # Check if this is a list key
        if not value:
            current_key = key
            continue

        # Handle different value types
        if value.lower() in ('true', 'false'):
            result[key] = value.lower() == 'true'
        elif value.startswith('[') and value.endswith(']'):
            # Simple list parsing
            result[key] = [v.strip().strip('"').strip("'") for v in value[1:-1].split(',') if v.strip()]
        else:
            # String value
            result[key] = value.strip('"').strip("'")
    elif current_key and line.startswith('  '):
        # Nested item (treat as part of current context)
        continue

# Save final list if exists
if current_key and current_list:
    result[current_key] = current_list

return result

def parse_frontmatter(content: str) -> tuple[Dict[str, Any], str]: """Parse YAML frontmatter from markdown content.""" if not content.startswith('---'): return {}, content

parts = content.split('---', 2)
if len(parts) < 3:
    return {}, content

try:
    metadata = parse_simple_yaml(parts[1])
    body = parts[2].strip()
    return metadata or {}, body
except Exception as e:
    print(f"⚠️  YAML parsing error: {e}")
    return {}, content

def extract_agent_metadata(agent_file: Path) -> Dict[str, Any]: """ Extract metadata from an agent markdown file.

Agent files have YAML frontmatter with:
- name: Agent ID
- description: One-line description
- tools: List of available tools
- model: LLM model to use
"""
try:
    with open(agent_file, 'r', encoding='utf-8') as f:
        file_content = f.read()

    metadata, content = parse_frontmatter(file_content)

    # Extract capabilities from content (first section after frontmatter)
    capabilities = []
    use_cases = []

    # Find "Core Responsibilities" or similar section
    responsibilities_match = re.search(
        r'## Core Responsibilities\s*\n(.*?)(?=\n##|\Z)',
        content,
        re.DOTALL
    )
    if responsibilities_match:
        resp_text = responsibilities_match.group(1)
        # Extract numbered or bulleted items
        capabilities = re.findall(r'(?:###|\*|-)\s*\*\*(.+?)\*\*', resp_text)

    # Find usage examples
    examples_match = re.search(
        r'## Usage Examples?\s*\n(.*?)(?=\n##|\Z)',
        content,
        re.DOTALL
    )
    if examples_match:
        examples_text = examples_match.group(1)
        # Extract usage patterns
        use_cases = re.findall(r'```[^\n]*\n(.+?)\n```', examples_text, re.DOTALL)
        use_cases = [uc.strip() for uc in use_cases if 'Use' in uc or 'Task(' in uc]

    # Determine category from context awareness DNA
    category = "general"
    try:
        context_awareness = metadata.get('context_awareness', {})
        if isinstance(context_awareness, dict):
            keywords = context_awareness.get('auto_scope_keywords', {})
            if isinstance(keywords, dict) and keywords:
                # Use first category as primary
                category = list(keywords.keys())[0]
            elif isinstance(keywords, list) and keywords:
                # Handle list format - use first keyword as category hint
                category = "general"
    except (AttributeError, TypeError):
        # Fallback for malformed context_awareness structures
        category = "general"

    # Extract typical invocation - use agent_file.stem as fallback for name
    agent_name = metadata.get("name") or agent_file.stem
    typical_invocation = f'Task(subagent_type="general-purpose", prompt="Use {agent_name} subagent to <task>")'
    if use_cases:
        # Use first use case as example
        typical_invocation = use_cases[0][:200] if len(use_cases[0]) < 200 else use_cases[0][:197] + "..."

    # Extract tags
    tags = []
    try:
        context_awareness = metadata.get('context_awareness', {})
        if isinstance(context_awareness, dict):
            confidence_boosters = context_awareness.get('confidence_boosters', [])
            if isinstance(confidence_boosters, list):
                for booster in confidence_boosters:
                    if isinstance(booster, str):
                        # Extract words from confidence boosters
                        words = re.findall(r'\w+', booster.lower())
                        tags.extend(words[:3])  # Take first 3 words
    except (AttributeError, TypeError):
        pass  # Keep tags empty on parsing errors

    tags = list(set(tags))[:5]  # Deduplicate and limit to 5 tags

    return {
        "id": metadata.get("name", agent_file.stem),
        "name": metadata.get("name", agent_file.stem).replace('-', ' ').title(),
        "category": category,
        "description": metadata.get("description", ""),
        "capabilities": capabilities[:5],  # Limit to top 5
        "use_cases": use_cases[:3],  # Limit to top 3
        "typical_invocation": typical_invocation,
        "llm_binding": {
            "provider": "anthropic-claude",
            "model": metadata.get("model", "sonnet"),
            "temperature": 0.7,
            "max_tokens": 4096
        },
        "tools": metadata.get("tools", []),
        "tags": tags,
        "metadata": {
            "automation_features": metadata.get("automation_features", {}),
            "progress_checkpoints": metadata.get("progress_checkpoints", {}),
        }
    }
except Exception as e:
    print(f"⚠️  Error extracting agent metadata from {agent_file}: {e}")
    return None

def extract_skill_metadata(skill_dir: Path) -> Dict[str, Any]: """ Extract metadata from a skill directory.

Skill directories contain:
- SKILL.md: Main skill documentation with frontmatter
- core/: Implementation files
- templates/: Reusable templates
"""
skill_md = skill_dir / "SKILL.md"
if not skill_md.exists():
    return None

try:
    with open(skill_md, 'r', encoding='utf-8') as f:
        file_content = f.read()

    metadata, content = parse_frontmatter(file_content)

    # Extract what the skill provides
    provides = []
    when_to_use_match = re.search(
        r'## When to Use\s*\n.*?✅.*?when:.*?\n(.*?)(?=\n##|❌)',
        content,
        re.DOTALL
    )
    if when_to_use_match:
        provides_text = when_to_use_match.group(1)
        provides = re.findall(r'(?:-|\*)\s*(.+?)(?:\n|$)', provides_text)
        provides = [p.strip() for p in provides if p.strip()][:5]

    # Extract use cases
    use_cases = []
    if provides:
        use_cases = provides[:3]  # Use top 3 as use cases

    # Extract tags from metadata
    tags = []
    if 'metadata' in metadata:
        tech_stack = metadata['metadata'].get('tech-stack', '')
        if tech_stack:
            tags = [t.strip() for t in tech_stack.split(',')][:5]

    return {
        "id": metadata.get("name", skill_dir.name),
        "name": metadata.get("name", skill_dir.name).replace('-', ' ').title(),
        "description": metadata.get("description", ""),
        "provides": provides,
        "use_cases": use_cases,
        "activation": f'Skill(skill="{metadata.get("name", skill_dir.name)}")',
        "tags": tags,
        "metadata": {
            "license": metadata.get("license", "MIT"),
            "allowed_tools": metadata.get("allowed-tools", []),
            "token_efficiency": metadata.get("metadata", {}).get("token-efficiency", ""),
            "integration": metadata.get("metadata", {}).get("integration", ""),
        }
    }
except Exception as e:
    print(f"⚠️  Error extracting skill metadata from {skill_dir}: {e}")
    return None

def extract_command_metadata(command_file: Path) -> Dict[str, Any]: """ Extract metadata from a command markdown file.

Command files describe slash commands with:
- Title/description
- Steps to follow
- Success criteria
- Usage examples
"""
try:
    with open(command_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract title/description (first heading)
    title_match = re.search(r'^# (.+?)$', content, re.MULTILINE)
    title = title_match.group(1) if title_match else command_file.stem

    # Extract description (first paragraph after title)
    desc_match = re.search(
        r'^# .+?\n\n(.+?)(?:\n\n|\n##)',
        content,
        re.DOTALL | re.MULTILINE
    )
    description = desc_match.group(1).strip() if desc_match else ""
    description = description.replace('\n', ' ')[:200]

    # Extract workflow steps
    workflow = []
    steps_matches = re.findall(r'### Step \d+: (.+?)$', content, re.MULTILINE)
    workflow = steps_matches[:5]  # Limit to 5 steps

    # Extract agents invoked
    agents_invoked = []
    task_matches = re.findall(r'Use (\w+-?\w+-?\w*) subagent', content)
    agents_invoked = list(set(task_matches))[:5]

    # Generate syntax
    command_name = f"/{command_file.stem}"
    syntax = f'{command_name} "<description>"'

    # Extract example
    example_match = re.search(r'```(?:bash)?\n(/\w+.+?)$', content, re.MULTILINE)
    example = example_match.group(1) if example_match else syntax

    # Extract tags from filename and content
    tags = []
    if 'project' in command_file.stem or 'new' in command_file.stem:
        tags.append('project-creation')
    if 'deploy' in command_file.stem or 'build' in command_file.stem:
        tags.append('deployment')
    if 'analyze' in command_file.stem or 'research' in command_file.stem:
        tags.append('analysis')
    if 'hook' in command_file.stem:
        tags.append('automation')

    tags.append('workflow')

    return {
        "id": command_file.stem,
        "name": command_name,
        "description": description,
        "syntax": syntax,
        "workflow": workflow,
        "example": example,
        "typical_duration": "5-10 minutes",  # Default
        "agents_invoked": agents_invoked,
        "tags": tags[:5]
    }
except Exception as e:
    print(f"⚠️  Error extracting command metadata from {command_file}: {e}")
    return None

def extract_script_metadata(script_file: Path) -> Dict[str, Any]: """ Extract metadata from a Python script file.

Scripts have docstrings with:
- Description
- Usage
- Author/copyright
"""
try:
    with open(script_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract module docstring
    docstring_match = re.search(r'"""(.*?)"""', content, re.DOTALL)
    docstring = docstring_match.group(1).strip() if docstring_match else ""

    # Extract description (first paragraph)
    desc_lines = []
    for line in docstring.split('\n'):
        line = line.strip()
        if not line or line.startswith('Usage:') or line.startswith('Author:'):
            break
        desc_lines.append(line)

    description = ' '.join(desc_lines)[:200]

    # Extract usage
    usage_match = re.search(r'Usage:\s*\n\s*(.+?)$', docstring, re.MULTILINE)
    usage = usage_match.group(1).strip() if usage_match else f"python3 scripts/{script_file.name}"

    # Extract arguments from usage
    arguments = []
    arg_matches = re.findall(r'(?:"<(.+?)>"|--(\w+))', usage)
    for arg_match in arg_matches:
        if arg_match[0]:  # Positional argument
            arguments.append({
                "name": arg_match[0],
                "required": True,
                "type": "string"
            })
        elif arg_match[1]:  # Flag argument
            arguments.append({
                "name": f"--{arg_match[1]}",
                "required": False,
                "type": "flag"
            })

    # Extract tags from script purpose
    tags = []
    if 'checkpoint' in script_file.stem or 'memory' in description.lower():
        tags.append('memory')
    if 'git' in script_file.stem or 'commit' in description.lower():
        tags.append('git')
    if 'submodule' in script_file.stem:
        tags.append('submodule')
    if 'setup' in script_file.stem or 'bootstrap' in script_file.stem:
        tags.append('setup')

    tags.append('automation')

    return {
        "id": script_file.stem,
        "name": script_file.name,
        "path": f"scripts/{script_file.name}",
        "description": description,
        "usage": usage,
        "arguments": arguments,
        "output": "See script documentation",
        "tags": tags[:5]
    }
except Exception as e:
    print(f"⚠️  Error extracting script metadata from {script_file}: {e}")
    return None

def main(): """Main extraction workflow.""" args = parse_args()

print("🔍 Framework Metadata Extraction - Phase 2C")
print("=" * 60)

# Find project root
project_root = Path(__file__).parent.parent

# Create output directories
config_dir = project_root / ".coditect" / "config"
config_dir.mkdir(parents=True, exist_ok=True)

(config_dir / "agents").mkdir(exist_ok=True)
(config_dir / "skills").mkdir(exist_ok=True)
(config_dir / "commands").mkdir(exist_ok=True)
(config_dir / "scripts").mkdir(exist_ok=True)

# Extract agent metadata
print("\n📋 Extracting agent metadata...")
agents_dir = project_root / "agents"
agent_files = [f for f in agents_dir.glob("*.md") if f.name != "README.md"]

agents_by_category = {}
for agent_file in agent_files:
    agent_metadata = extract_agent_metadata(agent_file)
    if agent_metadata:
        category = agent_metadata.get("category", "general")
        if category not in agents_by_category:
            agents_by_category[category] = []
        agents_by_category[category].append(agent_metadata)

        # Write individual agent metadata file
        agent_json_file = config_dir / "agents" / f"{agent_metadata['id']}.json"
        with open(agent_json_file, 'w') as f:
            json.dump(agent_metadata, f, indent=2)

print(f"   ✅ Extracted {len(agent_files)} agents across {len(agents_by_category)} categories")

# Extract skill metadata
print("\n📋 Extracting skill metadata...")
skills_dir = project_root / "skills"
skill_dirs = [d for d in skills_dir.iterdir() if d.is_dir() and (d / "SKILL.md").exists()]

skills = []
for skill_dir in skill_dirs:
    skill_metadata = extract_skill_metadata(skill_dir)
    if skill_metadata:
        skills.append(skill_metadata)

        # Write individual skill metadata file
        skill_json_file = config_dir / "skills" / f"{skill_metadata['id']}.json"
        with open(skill_json_file, 'w') as f:
            json.dump(skill_metadata, f, indent=2)

print(f"   ✅ Extracted {len(skills)} skills")

# Extract command metadata
print("\n📋 Extracting command metadata...")
commands_dir = project_root / "commands"
command_files = list(commands_dir.glob("*.md"))

commands_by_category = {
    "project_creation": [],
    "development": [],
    "analysis": [],
    "deployment": [],
    "other": []
}

for command_file in command_files:
    command_metadata = extract_command_metadata(command_file)
    if command_metadata:
        # Categorize command
        tags = command_metadata.get("tags", [])
        if "project-creation" in tags:
            category = "project_creation"
        elif "deployment" in tags:
            category = "deployment"
        elif "analysis" in tags:
            category = "analysis"
        elif any(tag in tags for tag in ["development", "implement", "code"]):
            category = "development"
        else:
            category = "other"

        commands_by_category[category].append(command_metadata)

        # Write individual command metadata file
        command_json_file = config_dir / "commands" / f"{command_metadata['id']}.json"
        with open(command_json_file, 'w') as f:
            json.dump(command_metadata, f, indent=2)

print(f"   ✅ Extracted {len(command_files)} commands across {len(commands_by_category)} categories")

# Extract script metadata
print("\n📋 Extracting script metadata...")
scripts_dir = project_root / "scripts"
script_files = list(scripts_dir.glob("*.py"))

scripts = []
for script_file in script_files:
    script_metadata = extract_script_metadata(script_file)
    if script_metadata:
        scripts.append(script_metadata)

        # Write individual script metadata file
        script_json_file = config_dir / "scripts" / f"{script_metadata['id']}.json"
        with open(script_json_file, 'w') as f:
            json.dump(script_metadata, f, indent=2)

print(f"   ✅ Extracted {len(scripts)} scripts")

# Create framework registry
print("\n📦 Creating framework registry...")

registry = {
    "framework_version": "1.0.0",
    "last_updated": datetime.now().isoformat(),
    "components": {
        "agents": {
            "total": len(agent_files),
            "categories": agents_by_category
        },
        "skills": {
            "total": len(skills),
            "list": skills
        },
        "commands": {
            "total": len(command_files),
            "categories": commands_by_category
        },
        "scripts": {
            "total": len(scripts),
            "list": scripts
        }
    }
}

registry_file = config_dir / "framework-registry.json"
with open(registry_file, 'w') as f:
    json.dump(registry, f, indent=2)

print(f"   ✅ Framework registry created: {registry_file}")

# Summary
print("\n" + "=" * 60)
print("✅ Metadata Extraction Complete!")
print(f"   📊 Total components: {len(agent_files) + len(skills) + len(command_files) + len(scripts)}")
print(f"   👤 Agents: {len(agent_files)}")
print(f"   🎯 Skills: {len(skills)}")
print(f"   ⚡ Commands: {len(command_files)}")
print(f"   🔧 Scripts: {len(scripts)}")
print(f"\n   📁 Output: {config_dir}")
print("=" * 60)

if name == "main": main()

#!/usr/bin/env python3 """​

#!/usr/bin/env python3 """