# scripts/component-frontmatter-indexer.py

#!/usr/bin/env python3
"""
---
title: "Component Frontmatter Indexer"
component_type: script
version: "1.0.0"
audience: contributor
status: stable
summary: "Index component YAML frontmatter into context database for /cxq search"
keywords: ['component', 'database', 'frontmatter', 'indexer', 'search']
tokens: ~500
created: 2025-12-22
updated: 2025-12-22
script_name: "component-frontmatter-indexer.py"
language: python
executable: true
usage: "python3 scripts/component-frontmatter-indexer.py [options]"
python_version: "3.10+"
dependencies: []
modifies_files: true
network_access: false
requires_auth: false
---

CODITECT Component Frontmatter Indexer

Indexes all component YAML frontmatter into the context database for
searchability via /cxq. Integrates with the context extraction pipeline.

Usage:
    python3 scripts/component-frontmatter-indexer.py                    # Index all components
    python3 scripts/component-frontmatter-indexer.py --init             # Initialize schema
    python3 scripts/component-frontmatter-indexer.py --stats            # Show statistics
    python3 scripts/component-frontmatter-indexer.py --search "keyword" # Search
    python3 scripts/component-frontmatter-indexer.py --type agent       # Index specific type

Author: AZ1.AI INC
Version: 1.0.0
ADR: ADR-018-AGENTIC-DOCUMENTATION-STANDARD
"""

import argparse
import hashlib
import json
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

class ComponentFrontmatterIndexer:
    """Index component frontmatter into the context database.

    Discovers component files (agents, commands, skills, scripts, ...) under
    the project root, extracts their YAML frontmatter, and upserts the parsed
    fields into the ``component_frontmatter`` table plus an FTS5 index so
    they are searchable via /cxq.
    """

    SCHEMA_VERSION = "1.0.0"

    # Component discovery glob patterns, relative to the project root.
    COMPONENT_PATTERNS = {
        'agent': ['agents/*.md'],
        'command': ['commands/*.md'],
        'skill': ['skills/*/SKILL.md', 'skills/*/*.md'],
        'script': ['scripts/*.py'],
        'hook': ['hooks/*.py', 'hooks/*.md'],
        'prompt': ['prompts/*.md'],
        'workflow': ['docs/workflows/*.md'],
        'guide': ['docs/guides/*.md', 'docs/getting-started/*.md'],
        'reference': ['docs/reference/*.md'],
        # ADR-213: ADRs now in coditect-documentation; keep local path as fallback
        'adr': ['../../../docs/coditect-documentation/coditect-core/adrs/*.md',
                'internal/architecture/adrs/*.md']
    }

    def __init__(self, project_root: Path, db_path: Path):
        """Remember paths; no connection is opened until connect()."""
        self.project_root = project_root
        self.db_path = db_path
        self.conn: Optional[sqlite3.Connection] = None

    def connect(self) -> None:
        """Open the database connection with dict-like row access."""
        self.conn = sqlite3.connect(str(self.db_path))
        self.conn.row_factory = sqlite3.Row
        # SQLite ignores ON DELETE CASCADE unless foreign keys are enabled
        # per-connection; the relations table below relies on it.
        self.conn.execute("PRAGMA foreign_keys = ON")

    def close(self) -> None:
        """Close the database connection if one is open."""
        if self.conn:
            self.conn.close()

    def initialize_schema(self) -> None:
        """Create the component frontmatter tables, FTS index, and triggers.

        Idempotent: every statement uses IF NOT EXISTS, so re-running is safe.
        """
        self.connect()
        cursor = self.conn.cursor()

        # Main component_frontmatter table (separate from existing components registry)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS component_frontmatter (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                path TEXT UNIQUE NOT NULL,
                hash TEXT NOT NULL,
                component_type TEXT NOT NULL,
                title TEXT,
                version TEXT,
                audience TEXT,
                status TEXT,
                summary TEXT,
                keywords TEXT,
                tokens INTEGER,
                created_date TEXT,
                updated_date TEXT,

                -- Agent-specific fields
                agent_type TEXT,
                domain TEXT,
                moe_role TEXT,
                moe_capabilities TEXT,
                invocation_pattern TEXT,

                -- Command-specific fields
                command_name TEXT,
                aliases TEXT,
                usage TEXT,

                -- Skill-specific fields
                skill_name TEXT,
                skill_category TEXT,
                when_to_use TEXT,
                composes_with TEXT,

                -- Script-specific fields
                script_name TEXT,
                language TEXT,
                dependencies TEXT,

                -- Metadata
                raw_frontmatter TEXT,
                indexed_at TEXT DEFAULT CURRENT_TIMESTAMP,
                conformance_level TEXT,

                -- Cross-references
                related_agents TEXT,
                related_commands TEXT,
                related_skills TEXT
            )
        """)

        # Full-text search index (external-content table backed by the main table)
        cursor.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS component_frontmatter_fts USING fts5(
                path,
                title,
                summary,
                keywords,
                domain,
                moe_role,
                when_to_use,
                content='component_frontmatter',
                content_rowid='id'
            )
        """)

        # Triggers to keep FTS in sync (the FTS5 external-content pattern:
        # insert mirrors rows in, 'delete' commands remove stale entries).
        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS component_frontmatter_ai AFTER INSERT ON component_frontmatter BEGIN
                INSERT INTO component_frontmatter_fts(rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES (new.id, new.path, new.title, new.summary, new.keywords, new.domain, new.moe_role, new.when_to_use);
            END
        """)

        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS component_frontmatter_ad AFTER DELETE ON component_frontmatter BEGIN
                INSERT INTO component_frontmatter_fts(component_frontmatter_fts, rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES ('delete', old.id, old.path, old.title, old.summary, old.keywords, old.domain, old.moe_role, old.when_to_use);
            END
        """)

        cursor.execute("""
            CREATE TRIGGER IF NOT EXISTS component_frontmatter_au AFTER UPDATE ON component_frontmatter BEGIN
                INSERT INTO component_frontmatter_fts(component_frontmatter_fts, rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES ('delete', old.id, old.path, old.title, old.summary, old.keywords, old.domain, old.moe_role, old.when_to_use);
                INSERT INTO component_frontmatter_fts(rowid, path, title, summary, keywords, domain, moe_role, when_to_use)
                VALUES (new.id, new.path, new.title, new.summary, new.keywords, new.domain, new.moe_role, new.when_to_use);
            END
        """)

        # Cross-reference table for relationships
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS component_frontmatter_relations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_id INTEGER NOT NULL,
                target_id INTEGER NOT NULL,
                relation_type TEXT NOT NULL,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (source_id) REFERENCES component_frontmatter(id) ON DELETE CASCADE,
                FOREIGN KEY (target_id) REFERENCES component_frontmatter(id) ON DELETE CASCADE,
                UNIQUE(source_id, target_id, relation_type)
            )
        """)

        # Indexes for the common filter columns
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_type ON component_frontmatter(component_type)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_moe ON component_frontmatter(moe_role)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_status ON component_frontmatter(status)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_cf_domain ON component_frontmatter(domain)")

        # Schema version tracking
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS component_schema_info (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        """)
        cursor.execute("""
            INSERT OR REPLACE INTO component_schema_info (key, value)
            VALUES ('version', ?), ('initialized_at', ?)
        """, (self.SCHEMA_VERSION, datetime.now(timezone.utc).isoformat()))

        self.conn.commit()
        print(f"āœ“ Schema initialized (version {self.SCHEMA_VERSION})")
        self.close()

    def extract_frontmatter(self, file_path: Path) -> Optional[Dict[str, Any]]:
        """Extract YAML frontmatter from a file.

        Returns the parsed mapping, or None if the file has no ``---``
        delimited frontmatter or cannot be read (best-effort by design:
        files without frontmatter are indexed at conformance L0_NONE).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Frontmatter must start at the very first byte of the file.
            if not content.startswith('---'):
                return None

            # Find the closing '---' delimiter.
            lines = content.split('\n')
            end_idx = None
            for i, line in enumerate(lines[1:], 1):
                if line.strip() == '---':
                    end_idx = i
                    break

            if end_idx is None:
                return None

            frontmatter_text = '\n'.join(lines[1:end_idx])
            return self._parse_yaml(frontmatter_text)

        except (OSError, UnicodeDecodeError):
            # Unreadable file: treat the same as "no frontmatter".
            return None

    def _parse_yaml(self, text: str) -> Dict[str, Any]:
        """Minimal YAML parser for the frontmatter subset used by components.

        Supports: ``key: value`` scalars (quoted strings, booleans, ints,
        ``~N`` token estimates), inline lists ``[a, b]``, and indented
        ``- item`` block lists. Deliberately not a general YAML parser.
        """
        frontmatter: Dict[str, Any] = {}
        current_key: Optional[str] = None
        current_list: Optional[List[str]] = None

        for line in text.split('\n'):
            line = line.rstrip()

            # Skip empty lines and comments.
            if not line or line.strip().startswith('#'):
                continue

            # Block-list item under the most recent key. Accept any indent
            # depth (the previous ' - ' check silently dropped two-space
            # indented lists, the most common YAML style).
            if line.startswith(' ') and line.lstrip().startswith('- ') and current_key:
                if current_list is None:
                    current_list = []
                    frontmatter[current_key] = current_list
                current_list.append(line.strip()[2:].strip())
                continue

            # Top-level "key: value" (list items are handled above, so any
            # unindented line with a colon starts a new key).
            if ':' in line and not line.startswith(' '):
                current_list = None
                key, _, value = line.partition(':')
                key = key.strip()
                value = value.strip()
                current_key = key

                if value.startswith('[') and value.endswith(']'):
                    # Inline list: [a, b, c]
                    items = value[1:-1].split(',')
                    frontmatter[key] = [item.strip().strip('"\'') for item in items if item.strip()]
                elif value.startswith('"') and value.endswith('"'):
                    frontmatter[key] = value[1:-1]
                elif value.startswith("'") and value.endswith("'"):
                    frontmatter[key] = value[1:-1]
                elif value.lower() in ('true', 'false'):
                    frontmatter[key] = value.lower() == 'true'
                elif value.startswith('~'):
                    # Token estimate like ~2000
                    try:
                        frontmatter[key] = int(value[1:])
                    except ValueError:
                        frontmatter[key] = value
                elif value.isdigit():
                    frontmatter[key] = int(value)
                elif not value:
                    # Bare "key:" — either null or a block list follows.
                    frontmatter[key] = None
                else:
                    frontmatter[key] = value

        return frontmatter

    def infer_component_type(self, file_path: Path) -> str:
        """Infer component type from the file's directory path."""
        path_str = file_path.as_posix()  # '/'-separators on every platform

        if '/agents/' in path_str:
            return 'agent'
        elif '/commands/' in path_str:
            return 'command'
        elif '/skills/' in path_str:
            return 'skill'
        elif '/scripts/' in path_str:
            return 'script'
        elif '/hooks/' in path_str:
            return 'hook'
        elif '/prompts/' in path_str:
            return 'prompt'
        elif '/workflows/' in path_str:
            return 'workflow'
        elif '/guides/' in path_str or '/getting-started/' in path_str:
            return 'guide'
        elif '/reference/' in path_str:
            return 'reference'
        elif '/adrs/' in path_str:
            return 'adr'
        else:
            return 'unknown'

    def compute_file_hash(self, file_path: Path) -> str:
        """Compute SHA256 hash of file content (hex digest)."""
        with open(file_path, 'rb') as f:
            return hashlib.sha256(f.read()).hexdigest()

    def discover_components(self, component_type: Optional[str] = None) -> List[Path]:
        """Discover all component files, optionally filtered by type.

        Returns a sorted, de-duplicated list of paths.
        """
        files: List[Path] = []

        if component_type and component_type in self.COMPONENT_PATTERNS:
            patterns = list(self.COMPONENT_PATTERNS[component_type])
        else:
            patterns = []
            for type_patterns in self.COMPONENT_PATTERNS.values():
                patterns.extend(type_patterns)

        for pattern in patterns:
            # Path.glob() does not support '..' segments, so for patterns
            # that escape the project root (the ADR-213 fallback) walk up
            # the parent chain first and glob the remainder.
            base = self.project_root
            while pattern.startswith('../'):
                base = base.parent
                pattern = pattern[3:]
            files.extend(base.glob(pattern))

        return sorted(set(files))

    def _list_to_json(self, value: Any) -> Optional[str]:
        """Convert a list to a JSON string for TEXT-column storage."""
        if value is None:
            return None
        if isinstance(value, list):
            return json.dumps(value)
        return str(value)

    def index_component(self, file_path: Path) -> Tuple[str, bool, Optional[str]]:
        """Index a single component. Returns (path, success, error_message).

        Requires an open connection (see connect()); the caller commits.
        """
        # Components outside the project root (e.g. shared ADRs) cannot be
        # relativized — store their full path instead of crashing index_all.
        try:
            relative_path = str(file_path.relative_to(self.project_root))
        except ValueError:
            relative_path = str(file_path)

        try:
            file_hash = self.compute_file_hash(file_path)
            frontmatter = self.extract_frontmatter(file_path)
            component_type = self.infer_component_type(file_path)

            # Determine conformance level:
            #   L0_NONE      no frontmatter / missing universal fields
            #   L1_UNIVERSAL title + summary present
            #   L2_TYPED     plus the type-specific key field
            #   L3_FULL      (agents) plus moe_capabilities
            if frontmatter is None:
                conformance_level = 'L0_NONE'
            elif 'title' in frontmatter and 'summary' in frontmatter:
                conformance_level = 'L1_UNIVERSAL'
                if component_type == 'agent' and 'moe_role' in frontmatter:
                    conformance_level = 'L2_TYPED'
                    if 'moe_capabilities' in frontmatter:
                        conformance_level = 'L3_FULL'
                elif component_type == 'command' and 'command_name' in frontmatter:
                    conformance_level = 'L2_TYPED'
                elif component_type == 'skill' and 'skill_name' in frontmatter:
                    conformance_level = 'L2_TYPED'
            else:
                conformance_level = 'L0_NONE'

            # Read field values through an empty dict when there is no
            # frontmatter, so every column falls back to NULL uniformly.
            fm: Dict[str, Any] = frontmatter or {}
            cursor = self.conn.cursor()

            # Upsert component (path is UNIQUE, so REPLACE refreshes in place).
            cursor.execute("""
                INSERT OR REPLACE INTO component_frontmatter (
                    path, hash, component_type, title, version, audience, status,
                    summary, keywords, tokens, created_date, updated_date,
                    agent_type, domain, moe_role, moe_capabilities, invocation_pattern,
                    command_name, aliases, usage,
                    skill_name, skill_category, when_to_use, composes_with,
                    script_name, language, dependencies,
                    raw_frontmatter, indexed_at, conformance_level
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                relative_path,
                file_hash,
                component_type,
                fm.get('title'),
                fm.get('version'),
                fm.get('audience'),
                fm.get('status'),
                fm.get('summary'),
                self._list_to_json(fm.get('keywords')),
                fm.get('tokens'),
                fm.get('created'),
                fm.get('updated'),
                fm.get('agent_type'),
                self._list_to_json(fm.get('domain')),
                fm.get('moe_role'),
                self._list_to_json(fm.get('moe_capabilities')),
                fm.get('invocation_pattern'),
                fm.get('command_name'),
                self._list_to_json(fm.get('aliases')),
                fm.get('usage'),
                fm.get('skill_name'),
                fm.get('skill_category'),
                fm.get('when_to_use'),
                self._list_to_json(fm.get('composes_with')),
                fm.get('script_name'),
                fm.get('language'),
                self._list_to_json(fm.get('dependencies')),
                json.dumps(frontmatter) if frontmatter else None,
                datetime.now(timezone.utc).isoformat(),
                conformance_level
            ))

            return (relative_path, True, None)

        except Exception as e:
            # Best-effort batch indexing: report the failure, keep going.
            return (relative_path, False, str(e))

    def index_all(self, component_type: Optional[str] = None) -> Dict[str, Any]:
        """Index all discovered components in one transaction.

        Returns {"indexed": int, "failed": int, "errors": [{path, error}]}.
        """
        self.connect()

        files = self.discover_components(component_type)
        results: Dict[str, Any] = {"indexed": 0, "failed": 0, "errors": []}

        for file_path in files:
            if file_path.name.startswith('.'):
                continue  # skip hidden files (editor swap files, .DS_Store, ...)

            path, success, error = self.index_component(file_path)
            if success:
                results["indexed"] += 1
            else:
                results["failed"] += 1
                results["errors"].append({"path": path, "error": error})

        self.conn.commit()
        self.close()

        return results

    def search(self, query: str, limit: int = 20) -> List[Dict[str, Any]]:
        """Search components using FTS5; results ordered by bm25 relevance."""
        self.connect()
        cursor = self.conn.cursor()

        # bm25() scores lower-is-better, so plain ORDER BY ranks best first.
        cursor.execute("""
            SELECT c.*,
                   bm25(component_frontmatter_fts) as relevance
            FROM component_frontmatter c
            JOIN component_frontmatter_fts ON c.id = component_frontmatter_fts.rowid
            WHERE component_frontmatter_fts MATCH ?
            ORDER BY relevance
            LIMIT ?
        """, (query, limit))

        results = []
        for row in cursor.fetchall():
            results.append({
                "path": row["path"],
                "type": row["component_type"],
                "title": row["title"],
                "summary": row["summary"],
                "moe_role": row["moe_role"],
                "conformance": row["conformance_level"],
                "relevance": row["relevance"]
            })

        self.close()
        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get indexing statistics: totals by type, conformance, and MoE role."""
        self.connect()
        cursor = self.conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM component_frontmatter")
        total = cursor.fetchone()[0]

        cursor.execute("""
            SELECT component_type, COUNT(*) as count
            FROM component_frontmatter
            GROUP BY component_type
            ORDER BY count DESC
        """)
        by_type = {row[0]: row[1] for row in cursor.fetchall()}

        cursor.execute("""
            SELECT conformance_level, COUNT(*) as count
            FROM component_frontmatter
            GROUP BY conformance_level
            ORDER BY conformance_level
        """)
        by_conformance = {row[0]: row[1] for row in cursor.fetchall()}

        cursor.execute("""
            SELECT moe_role, COUNT(*) as count
            FROM component_frontmatter
            WHERE moe_role IS NOT NULL
            GROUP BY moe_role
            ORDER BY count DESC
        """)
        by_moe = {row[0]: row[1] for row in cursor.fetchall()}

        cursor.execute("SELECT MAX(indexed_at) FROM component_frontmatter")
        last_indexed = cursor.fetchone()[0]

        self.close()

        return {
            "total_components": total,
            "by_type": by_type,
            "by_conformance": by_conformance,
            "by_moe_role": by_moe,
            "last_indexed": last_indexed
        }

    def list_by_moe_role(self, role: str) -> List[Dict[str, Any]]:
        """List all components with a specific MoE role, ordered by title."""
        self.connect()
        cursor = self.conn.cursor()

        cursor.execute("""
            SELECT path, title, summary, domain, moe_capabilities
            FROM component_frontmatter
            WHERE moe_role = ?
            ORDER BY title
        """, (role,))

        results = []
        for row in cursor.fetchall():
            results.append({
                "path": row["path"],
                "title": row["title"],
                "summary": row["summary"],
                "domain": row["domain"],
                "capabilities": row["moe_capabilities"]
            })

        self.close()
        return results

def main():
    """CLI entry point: parse arguments and dispatch to the indexer.

    Modes (mutually exclusive, checked in order): --init, --stats,
    --search, --list-moe; with no mode flag it indexes all components
    (optionally restricted by --type). --json switches output format.
    """
    parser = argparse.ArgumentParser(
        description="Index component frontmatter into context database",
        epilog="Part of CODITECT context extraction pipeline"
    )
    parser.add_argument("--init", action="store_true", help="Initialize database schema")
    parser.add_argument("--stats", action="store_true", help="Show indexing statistics")
    parser.add_argument("--search", metavar="QUERY", help="Search components")
    parser.add_argument("--type", metavar="TYPE", help="Index specific component type")
    parser.add_argument("--list-moe", metavar="ROLE", help="List components by MoE role")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    # Find paths relative to this script's location.
    script_path = Path(__file__).resolve()
    project_root = script_path.parent.parent

    # ADR-114 & ADR-118: Use centralized path discovery
    sys.path.insert(0, str(script_path.parent / "core"))
    try:
        from paths import PLATFORM_DB
        db_path = PLATFORM_DB  # Component data goes to platform.db (Tier 1)
    except ImportError:
        # Fallback for backward compatibility
        _user_data = Path.home() / "PROJECTS" / ".coditect-data" / "context-storage"
        if _user_data.exists():
            db_path = _user_data / "platform.db"
        else:
            db_path = project_root / "context-storage" / "platform.db"

    indexer = ComponentFrontmatterIndexer(project_root, db_path)

    if args.init:
        indexer.initialize_schema()
        return

    if args.stats:
        stats = indexer.get_stats()
        if args.json:
            print(json.dumps(stats, indent=2))
        else:
            print("\nšŸ“Š Component Index Statistics")
            print("=" * 50)
            print(f"Total Indexed: {stats['total_components']}")
            print(f"\nBy Type:")
            for t, count in stats['by_type'].items():
                print(f" {t}: {count}")
            print(f"\nBy Conformance:")
            for level, count in stats['by_conformance'].items():
                print(f" {level}: {count}")
            if stats['by_moe_role']:
                print(f"\nBy MoE Role:")
                for role, count in stats['by_moe_role'].items():
                    print(f" {role}: {count}")
            print(f"\nLast Indexed: {stats['last_indexed']}")
        return

    if args.search:
        results = indexer.search(args.search)
        if args.json:
            print(json.dumps(results, indent=2))
        else:
            print(f"\nšŸ” Search Results for '{args.search}'")
            print("=" * 50)
            for r in results:
                print(f"\n{r['title'] or r['path']}")
                print(f" Type: {r['type']} | MoE: {r['moe_role'] or 'N/A'}")
                print(f" Path: {r['path']}")
                if r['summary']:
                    print(f" Summary: {r['summary'][:80]}...")
        return

    if args.list_moe:
        results = indexer.list_by_moe_role(args.list_moe)
        if args.json:
            print(json.dumps(results, indent=2))
        else:
            print(f"\nšŸŽÆ Components with MoE Role: {args.list_moe}")
            print("=" * 50)
            for r in results:
                print(f"\n{r['title'] or r['path']}")
                if r['summary']:
                    print(f" {r['summary'][:80]}...")
        return

    # Default: index all
    print("šŸ”„ Indexing components...")
    results = indexer.index_all(component_type=args.type)

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        print(f"\nāœ“ Indexed: {results['indexed']}")
        if results['failed'] > 0:
            print(f"āœ— Failed: {results['failed']}")
            for err in results['errors'][:5]:
                print(f" - {err['path']}: {err['error']}")

# Script entry point: the scrape dropped the dunder underscores, which would
# raise NameError (`name` is undefined) and never run main().
if __name__ == "__main__":
    main()