# scripts/component-frontmatter-indexer.py

#!/usr/bin/env python3
"""
---
title: "Component Frontmatter Indexer"
component_type: script
version: "1.0.0"
audience: contributor
status: stable
summary: "Index component YAML frontmatter into context database for /cxq search"
keywords: ['component', 'database', 'frontmatter', 'indexer', 'search']
tokens: ~500
created: 2025-12-22
updated: 2025-12-22
script_name: "component-frontmatter-indexer.py"
language: python
executable: true
usage: "python3 scripts/component-frontmatter-indexer.py [options]"
python_version: "3.10+"
dependencies: []
modifies_files: true
network_access: false
requires_auth: false
---

CODITECT Component Frontmatter Indexer

Indexes all component YAML frontmatter into the context database for
searchability via /cxq. Integrates with the context extraction pipeline.

Usage:
    python3 scripts/component-frontmatter-indexer.py                    # Index all components
    python3 scripts/component-frontmatter-indexer.py --init             # Initialize schema
    python3 scripts/component-frontmatter-indexer.py --stats            # Show statistics
    python3 scripts/component-frontmatter-indexer.py --search "keyword" # Search
    python3 scripts/component-frontmatter-indexer.py --type agent       # Index specific type

Author: AZ1.AI INC
Version: 1.0.0
ADR: ADR-018-AGENTIC-DOCUMENTATION-STANDARD
"""

import argparse
import hashlib
import json
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

class ComponentFrontmatterIndexer:
    """Index component frontmatter into the context database.

    Discovers component files (agents, commands, skills, scripts, ...) under
    the project root, extracts their YAML frontmatter, and upserts the parsed
    fields into the ``component_frontmatter`` table plus an FTS5 index so
    they are searchable via /cxq.
    """

    SCHEMA_VERSION = "1.0.0"

    # Component discovery glob patterns, relative to the project root.
    COMPONENT_PATTERNS = {
        'agent': ['agents/*.md'],
        'command': ['commands/*.md'],
        'skill': ['skills/*/SKILL.md', 'skills/*/*.md'],
        'script': ['scripts/*.py'],
        'hook': ['hooks/*.py', 'hooks/*.md'],
        'prompt': ['prompts/*.md'],
        'workflow': ['docs/workflows/*.md'],
        'guide': ['docs/guides/*.md', 'docs/getting-started/*.md'],
        'reference': ['docs/reference/*.md'],
        # ADR-213: ADRs now in coditect-documentation; keep local path as fallback
        'adr': ['../../../docs/coditect-documentation/coditect-core/adrs/*.md',
                'internal/architecture/adrs/*.md']
    }

    def __init__(self, project_root: Path, db_path: Path):
        """Remember paths; no connection is opened until connect()."""
        self.project_root = project_root
        self.db_path = db_path
        self.conn: Optional[sqlite3.Connection] = None

    def connect(self) -> None:
        """Open the database connection with dict-like row access."""
        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row
        # SQLite ignores ON DELETE CASCADE unless foreign keys are enabled
        # per-connection; the relations table below relies on it.
        self.conn.execute("PRAGMA foreign_keys = ON")

    def close(self) -> None:
        """Close the database connection if one is open."""
        if self.conn:
            self.conn.close()

    def initialize_schema(self) -> None:
        """Create the component frontmatter tables, FTS index, and triggers.

        Idempotent: every statement uses IF NOT EXISTS, so re-running is safe.
        """
        self.connect()
        cursor = self.conn.cursor()

        # Main component_frontmatter table (separate from existing components registry)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS component_frontmatter (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                path TEXT UNIQUE NOT NULL,
                hash TEXT NOT NULL,
                component_type TEXT NOT NULL,
                title TEXT,
                version TEXT,
                audience TEXT,
                status TEXT,
                summary TEXT,
                keywords TEXT,
                tokens INTEGER,
                created_date TEXT,
                updated_date TEXT,

                -- Agent-specific fields
                agent_type TEXT,
                domain TEXT,
                moe_role TEXT,
                moe_capabilities TEXT,
                invocation_pattern TEXT,

                -- Command-specific fields
                command_name TEXT,
                aliases TEXT,
                usage TEXT,

                -- Skill-specific fields
                skill_name TEXT,
                skill_category TEXT,
                when_to_use TEXT,
                composes_with TEXT,

                -- Script-specific fields
                script_name TEXT,
                language TEXT,
                dependencies TEXT,

                -- Metadata
                raw_frontmatter TEXT,
                indexed_at TEXT DEFAULT CURRENT_TIMESTAMP,
                conformance_level TEXT,

                -- Cross-references
                related_agents TEXT,
                related_commands TEXT,
                related_skills TEXT
            )
        """)

        # Full-text search index (external-content table backed by the main table)
        cursor.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS component_frontmatter_fts USING fts5(
                path,
                title,
                summary,
                keywords,
                domain,
                moe_role,
                when_to_use,
                content='component_frontmatter',
                content_rowid='id'
            )
        """)

        # Triggers to keep FTS in sync (the FTS5 external-content pattern:
        # insert mirrors rows in, 'delete' commands remove stale entries).
        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS component_frontmatter_ai AFTER INSERT ON component_frontmatter BEGIN
                INSERT INTO component_frontmatter_fts(rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES (new.id, new.path, new.title, new.summary, new.keywords, new.domain, new.moe_role, new.when_to_use);
            END
        """)

        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS component_frontmatter_ad AFTER DELETE ON component_frontmatter BEGIN
                INSERT INTO component_frontmatter_fts(component_frontmatter_fts, rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES ('delete', old.id, old.path, old.title, old.summary, old.keywords, old.domain, old.moe_role, old.when_to_use);
            END
        """)

        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS component_frontmatter_au AFTER UPDATE ON component_frontmatter BEGIN
                INSERT INTO component_frontmatter_fts(component_frontmatter_fts, rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES ('delete', old.id, old.path, old.title, old.summary, old.keywords, old.domain, old.moe_role, old.when_to_use);
                INSERT INTO component_frontmatter_fts(rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES (new.id, new.path, new.title, new.summary, new.keywords, new.domain, new.moe_role, new.when_to_use);
            END
        """)

        # Cross-reference table for relationships
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS component_frontmatter_relations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_id INTEGER NOT NULL,
                target_id INTEGER NOT NULL,
                relation_type TEXT NOT NULL,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (source_id) REFERENCES component_frontmatter(id) ON DELETE CASCADE,
                FOREIGN KEY (target_id) REFERENCES component_frontmatter(id) ON DELETE CASCADE,
                UNIQUE(source_id, target_id, relation_type)
            )
        """)

        # Indexes for the common filter columns
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_type ON component_frontmatter(component_type)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_moe ON component_frontmatter(moe_role)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_status ON component_frontmatter(status)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_domain ON component_frontmatter(domain)")

        # Schema version tracking
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS component_schema_info (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        """)
        cursor.execute("""
            INSERT OR REPLACE INTO component_schema_info (key, value)
            VALUES ('version', ?), ('initialized_at', ?)
        """, (self.SCHEMA_VERSION, datetime.now(timezone.utc).isoformat()))

        self.conn.commit()
        print(f"āœ“ Schema initialized (version {self.SCHEMA_VERSION})")
        self.close()

    def extract_frontmatter(self, file_path: Path) -> Optional[Dict[str, Any]]:
        """Extract YAML frontmatter from a file.

        Returns the parsed mapping, or None if the file has no ``---``
        delimited frontmatter or cannot be read (best-effort by design:
        files without frontmatter are indexed at conformance L0_NONE).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Frontmatter must start at the very first byte of the file.
            if not content.startswith('---'):
                return None

            # Find the closing '---' delimiter.
            lines = content.split('\n')
            end_idx = None
            for i, line in enumerate(lines[1:], 1):
                if line.strip() == '---':
                    end_idx = i
                    break

            if end_idx is None:
                return None

            frontmatter_text = '\n'.join(lines[1:end_idx])
            return self._parse_yaml(frontmatter_text)

        except (OSError, UnicodeDecodeError):
            # Unreadable file: treat the same as "no frontmatter".
            return None

    def _parse_yaml(self, text: str) -> Dict[str, Any]:
        """Minimal YAML parser for the frontmatter subset used by components.

        Supports: ``key: value`` scalars (quoted strings, booleans, ints,
        ``~N`` token estimates), inline lists ``[a, b]``, and indented
        ``- item`` block lists. Deliberately not a general YAML parser.
        """
        frontmatter: Dict[str, Any] = {}
        current_key: Optional[str] = None
        current_list: Optional[List[str]] = None

        for line in text.split('\n'):
            line = line.rstrip()

            # Skip empty lines and comments.
            if not line or line.strip().startswith('#'):
                continue

            # Block-list item under the most recent key. Accept any indent
            # depth (the previous ' - ' check silently dropped two-space
            # indented lists, the most common YAML style).
            if line.startswith(' ') and line.lstrip().startswith('- ') and current_key:
                if current_list is None:
                    current_list = []
                    frontmatter[current_key] = current_list
                current_list.append(line.strip()[2:].strip())
                continue

            # Top-level "key: value" (list items are handled above, so any
            # unindented line with a colon starts a new key).
            if ':' in line and not line.startswith(' '):
                current_list = None
                key, _, value = line.partition(':')
                key = key.strip()
                value = value.strip()
                current_key = key

                if value.startswith('[') and value.endswith(']'):
                    # Inline list: [a, b, c]
                    items = value[1:-1].split(',')
                    frontmatter[key] = [item.strip().strip('"\'') for item in items if item.strip()]
                elif value.startswith('"') and value.endswith('"'):
                    frontmatter[key] = value[1:-1]
                elif value.startswith("'") and value.endswith("'"):
                    frontmatter[key] = value[1:-1]
                elif value.lower() in ('true', 'false'):
                    frontmatter[key] = value.lower() == 'true'
                elif value.startswith('~'):
                    # Token estimate like ~2000
                    try:
                        frontmatter[key] = int(value[1:])
                    except ValueError:
                        frontmatter[key] = value
                elif value.isdigit():
                    frontmatter[key] = int(value)
                elif not value:
                    # Bare "key:" — either null or a block list follows.
                    frontmatter[key] = None
                else:
                    frontmatter[key] = value

        return frontmatter

    def infer_component_type(self, file_path: Path) -> str:
        """Infer component type from the file's directory path."""
        path_str = file_path.as_posix()  # '/'-separators on every platform

        if '/agents/' in path_str:
            return 'agent'
        elif '/commands/' in path_str:
            return 'command'
        elif '/skills/' in path_str:
            return 'skill'
        elif '/scripts/' in path_str:
            return 'script'
        elif '/hooks/' in path_str:
            return 'hook'
        elif '/prompts/' in path_str:
            return 'prompt'
        elif '/workflows/' in path_str:
            return 'workflow'
        elif '/guides/' in path_str or '/getting-started/' in path_str:
            return 'guide'
        elif '/reference/' in path_str:
            return 'reference'
        elif '/adrs/' in path_str:
            return 'adr'
        else:
            return 'unknown'

    def compute_file_hash(self, file_path: Path) -> str:
        """Compute SHA256 hash of file content (hex digest)."""
        with open(file_path, 'rb') as f:
            return hashlib.sha256(f.read()).hexdigest()

    def discover_components(self, component_type: Optional[str] = None) -> List[Path]:
        """Discover all component files, optionally filtered by type.

        Returns a sorted, de-duplicated list of paths.
        """
        files: List[Path] = []

        if component_type and component_type in self.COMPONENT_PATTERNS:
            patterns = list(self.COMPONENT_PATTERNS[component_type])
        else:
            patterns = []
            for type_patterns in self.COMPONENT_PATTERNS.values():
                patterns.extend(type_patterns)

        for pattern in patterns:
            # Path.glob() does not support '..' segments, so for patterns
            # that escape the project root (the ADR-213 fallback) walk up
            # the parent chain first and glob the remainder.
            base = self.project_root
            while pattern.startswith('../'):
                base = base.parent
                pattern = pattern[3:]
            files.extend(base.glob(pattern))

        return sorted(set(files))

    def _list_to_json(self, value: Any) -> Optional[str]:
        """Convert a list to a JSON string for TEXT-column storage."""
        if value is None:
            return None
        if isinstance(value, list):
            return json.dumps(value)
        return str(value)

    def index_component(self, file_path: Path) -> Tuple[str, bool, Optional[str]]:
        """Index a single component. Returns (path, success, error_message).

        Requires an open connection (see connect()); the caller commits.
        """
        # Components outside the project root (e.g. shared ADRs) cannot be
        # relativized — store their full path instead of crashing index_all.
        try:
            relative_path = str(file_path.relative_to(self.project_root))
        except ValueError:
            relative_path = str(file_path)

        try:
            file_hash = self.compute_file_hash(file_path)
            frontmatter = self.extract_frontmatter(file_path)
            component_type = self.infer_component_type(file_path)

            # Determine conformance level:
            #   L0_NONE      no frontmatter / missing universal fields
            #   L1_UNIVERSAL title + summary present
            #   L2_TYPED     plus the type-specific key field
            #   L3_FULL      (agents) plus moe_capabilities
            if frontmatter is None:
                conformance_level = 'L0_NONE'
            elif 'title' in frontmatter and 'summary' in frontmatter:
                conformance_level = 'L1_UNIVERSAL'
                if component_type == 'agent' and 'moe_role' in frontmatter:
                    conformance_level = 'L2_TYPED'
                    if 'moe_capabilities' in frontmatter:
                        conformance_level = 'L3_FULL'
                elif component_type == 'command' and 'command_name' in frontmatter:
                    conformance_level = 'L2_TYPED'
                elif component_type == 'skill' and 'skill_name' in frontmatter:
                    conformance_level = 'L2_TYPED'
            else:
                conformance_level = 'L0_NONE'

            # Read field values through an empty dict when there is no
            # frontmatter, so every column falls back to NULL uniformly.
            fm: Dict[str, Any] = frontmatter or {}
            cursor = self.conn.cursor()

            # Upsert component (path is UNIQUE, so REPLACE refreshes in place).
            cursor.execute("""
                INSERT OR REPLACE INTO component_frontmatter (
                    path, hash, component_type, title, version, audience, status,
                    summary, keywords, tokens, created_date, updated_date,
                    agent_type, domain, moe_role, moe_capabilities, invocation_pattern,
                    command_name, aliases, usage,
                    skill_name, skill_category, when_to_use, composes_with,
                    script_name, language, dependencies,
                    raw_frontmatter, indexed_at, conformance_level
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                relative_path,
                file_hash,
                component_type,
                fm.get('title'),
                fm.get('version'),
                fm.get('audience'),
                fm.get('status'),
                fm.get('summary'),
                self._list_to_json(fm.get('keywords')),
                fm.get('tokens'),
                fm.get('created'),
                fm.get('updated'),
                fm.get('agent_type'),
                self._list_to_json(fm.get('domain')),
                fm.get('moe_role'),
                self._list_to_json(fm.get('moe_capabilities')),
                fm.get('invocation_pattern'),
                fm.get('command_name'),
                self._list_to_json(fm.get('aliases')),
                fm.get('usage'),
                fm.get('skill_name'),
                fm.get('skill_category'),
                fm.get('when_to_use'),
                self._list_to_json(fm.get('composes_with')),
                fm.get('script_name'),
                fm.get('language'),
                self._list_to_json(fm.get('dependencies')),
                json.dumps(frontmatter) if frontmatter else None,
                datetime.now(timezone.utc).isoformat(),
                conformance_level
            ))

            return (relative_path, True, None)

        except Exception as e:
            # Best-effort batch indexing: report the failure, keep going.
            return (relative_path, False, str(e))

    def index_all(self, component_type: Optional[str] = None) -> Dict[str, Any]:
        """Index all discovered components in one transaction.

        Returns {"indexed": int, "failed": int, "errors": [{path, error}]}.
        """
        self.connect()

        files = self.discover_components(component_type)
        results: Dict[str, Any] = {"indexed": 0, "failed": 0, "errors": []}

        for file_path in files:
            if file_path.name.startswith('.'):
                continue  # skip hidden files (editor swap files, .DS_Store, ...)

            path, success, error = self.index_component(file_path)
            if success:
                results["indexed"] += 1
            else:
                results["failed"] += 1
                results["errors"].append({"path": path, "error": error})

        self.conn.commit()
        self.close()

        return results

    def search(self, query: str, limit: int = 20) -> List[Dict[str, Any]]:
        """Search components using FTS5; results ordered by bm25 relevance."""
        self.connect()
        cursor = self.conn.cursor()

        # bm25() scores lower-is-better, so plain ORDER BY ranks best first.
        cursor.execute("""
            SELECT c.*,
                   bm25(component_frontmatter_fts) as relevance
            FROM component_frontmatter c
            JOIN component_frontmatter_fts ON c.id = component_frontmatter_fts.rowid
            WHERE component_frontmatter_fts MATCH ?
            ORDER BY relevance
            LIMIT ?
        """, (query, limit))

        results = []
        for row in cursor.fetchall():
            results.append({
                "path": row["path"],
                "type": row["component_type"],
                "title": row["title"],
                "summary": row["summary"],
                "moe_role": row["moe_role"],
                "conformance": row["conformance_level"],
                "relevance": row["relevance"]
            })

        self.close()
        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get indexing statistics: totals by type, conformance, and MoE role."""
        self.connect()
        cursor = self.conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM component_frontmatter")
        total = cursor.fetchone()[0]

        cursor.execute("""
            SELECT component_type, COUNT(*) as count
            FROM component_frontmatter
            GROUP BY component_type
            ORDER BY count DESC
        """)
        by_type = {row[0]: row[1] for row in cursor.fetchall()}

        cursor.execute("""
            SELECT conformance_level, COUNT(*) as count
            FROM component_frontmatter
            GROUP BY conformance_level
            ORDER BY conformance_level
        """)
        by_conformance = {row[0]: row[1] for row in cursor.fetchall()}

        cursor.execute("""
            SELECT moe_role, COUNT(*) as count
            FROM component_frontmatter
            WHERE moe_role IS NOT NULL
            GROUP BY moe_role
            ORDER BY count DESC
        """)
        by_moe = {row[0]: row[1] for row in cursor.fetchall()}

        cursor.execute("SELECT MAX(indexed_at) FROM component_frontmatter")
        last_indexed = cursor.fetchone()[0]

        self.close()

        return {
            "total_components": total,
            "by_type": by_type,
            "by_conformance": by_conformance,
            "by_moe_role": by_moe,
            "last_indexed": last_indexed
        }

    def list_by_moe_role(self, role: str) -> List[Dict[str, Any]]:
        """List all components with a specific MoE role, ordered by title."""
        self.connect()
        cursor = self.conn.cursor()

        cursor.execute("""
            SELECT path, title, summary, domain, moe_capabilities
            FROM component_frontmatter
            WHERE moe_role = ?
            ORDER BY title
        """, (role,))

        results = []
        for row in cursor.fetchall():
            results.append({
                "path": row["path"],
                "title": row["title"],
                "summary": row["summary"],
                "domain": row["domain"],
                "capabilities": row["moe_capabilities"]
            })

        self.close()
        return results

def main():
    """CLI entry point: parse arguments and dispatch to the indexer.

    Modes (mutually exclusive, checked in order): --init, --stats,
    --search, --list-moe; with no mode flag it indexes all components
    (optionally restricted by --type). --json switches output format.
    """
    parser = argparse.ArgumentParser(
        description="Index component frontmatter into context database",
        epilog="Part of CODITECT context extraction pipeline"
    )
    parser.add_argument("--init", action="store_true", help="Initialize database schema")
    parser.add_argument("--stats", action="store_true", help="Show indexing statistics")
    parser.add_argument("--search", metavar="QUERY", help="Search components")
    parser.add_argument("--type", metavar="TYPE", help="Index specific component type")
    parser.add_argument("--list-moe", metavar="ROLE", help="List components by MoE role")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    # Find paths relative to this script's location.
    script_path = Path(__file__).resolve()
    project_root = script_path.parent.parent

    # ADR-114 & ADR-118: Use centralized path discovery
    sys.path.insert(0, str(script_path.parent / "core"))
    try:
        from paths import PLATFORM_DB
        db_path = PLATFORM_DB  # Component data goes to platform.db (Tier 1)
    except ImportError:
        # Fallback for backward compatibility
        _user_data = Path.home() / "PROJECTS" / ".coditect-data" / "context-storage"
        if _user_data.exists():
            db_path = _user_data / "platform.db"
        else:
            db_path = project_root / "context-storage" / "platform.db"

    indexer = ComponentFrontmatterIndexer(project_root, db_path)

    if args.init:
        indexer.initialize_schema()
        return

    if args.stats:
        stats = indexer.get_stats()
        if args.json:
            print(json.dumps(stats, indent=2))
        else:
            print("\nšŸ“Š Component Index Statistics")
            print("=" * 50)
            print(f"Total Indexed: {stats['total_components']}")
            print(f"\nBy Type:")
            for t, count in stats['by_type'].items():
                print(f" {t}: {count}")
            print(f"\nBy Conformance:")
            for level, count in stats['by_conformance'].items():
                print(f" {level}: {count}")
            if stats['by_moe_role']:
                print(f"\nBy MoE Role:")
                for role, count in stats['by_moe_role'].items():
                    print(f" {role}: {count}")
            print(f"\nLast Indexed: {stats['last_indexed']}")
        return

    if args.search:
        results = indexer.search(args.search)
        if args.json:
            print(json.dumps(results, indent=2))
        else:
            print(f"\nšŸ” Search Results for '{args.search}'")
            print("=" * 50)
            for r in results:
                print(f"\n{r['title'] or r['path']}")
                print(f" Type: {r['type']} | MoE: {r['moe_role'] or 'N/A'}")
                print(f" Path: {r['path']}")
                if r['summary']:
                    print(f" Summary: {r['summary'][:80]}...")
        return

    if args.list_moe:
        results = indexer.list_by_moe_role(args.list_moe)
        if args.json:
            print(json.dumps(results, indent=2))
        else:
            print(f"\nšŸŽÆ Components with MoE Role: {args.list_moe}")
            print("=" * 50)
            for r in results:
                print(f"\n{r['title'] or r['path']}")
                if r['summary']:
                    print(f" {r['summary'][:80]}...")
        return

    # Default: index all
    print("šŸ”„ Indexing components...")
    results = indexer.index_all(component_type=args.type)

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        print(f"\nāœ“ Indexed: {results['indexed']}")
        if results['failed'] > 0:
            print(f"āœ— Failed: {results['failed']}")
            for err in results['errors'][:5]:
                print(f" - {err['path']}: {err['error']}")

# Script entry point: the scrape dropped the dunder underscores, which would
# raise NameError (`name` is undefined) and never run main().
if __name__ == "__main__":
    main()