#!/usr/bin/env python3
"""Platform Index Database Management (H.5.7.1)

Creates and manages platform-index.db for framework component embeddings,
with hash-based change tracking for incremental updates.

Part of ADR-103: Four-Database Separation Architecture

Usage:
    python3 platform-index-db.py --init           # Initialize database
    python3 platform-index-db.py --index          # Index all components
    python3 platform-index-db.py --index-changed  # Index only changed components
    python3 platform-index-db.py --stats          # Show statistics
    python3 platform-index-db.py --verify         # Verify database integrity
"""
import argparse
import hashlib
import json
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
ADR-114 & ADR-118: Use centralized path discovery
Platform-index.db is Tier 1 data (regenerable component embeddings)
CODITECT_HOME = Path.home() / ".coditect" # Framework install USER_DATA_DIR = Path.home() / "PROJECTS" / ".coditect-data" CONTEXT_STORAGE = USER_DATA_DIR / "context-storage" if USER_DATA_DIR.exists() else CODITECT_HOME / "context-storage" PLATFORM_INDEX_DB = CONTEXT_STORAGE / "platform-index.db" PLATFORM_DB = CONTEXT_STORAGE / "platform.db"
Component directories to index
COMPONENT_DIRS = { "agent": CODITECT_HOME / "agents", "skill": CODITECT_HOME / "skills", "command": CODITECT_HOME / "commands", "hook": CODITECT_HOME / "hooks", "script": CODITECT_HOME / "scripts", }
File patterns per component type
FILE_PATTERNS = { "agent": [".md"], "skill": ["/SKILL.md"], # Skills are in subdirectories "command": [".md"], "hook": [".py", ".sh"], "script": [".py", "*.sh"], }
Schema version for migrations
SCHEMA_VERSION = 1
SCHEMA = """ -- Schema version tracking CREATE TABLE IF NOT EXISTS schema_version ( version INTEGER PRIMARY KEY, applied_at TEXT DEFAULT CURRENT_TIMESTAMP );
-- Embedding model metadata CREATE TABLE IF NOT EXISTS embedding_models ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT UNIQUE NOT NULL, dimensions INTEGER NOT NULL, version TEXT, created_at TEXT DEFAULT CURRENT_TIMESTAMP );
-- Hash tracking for incremental updates CREATE TABLE IF NOT EXISTS component_hashes ( id INTEGER PRIMARY KEY AUTOINCREMENT, file_path TEXT UNIQUE NOT NULL, content_hash TEXT NOT NULL, file_size INTEGER, mtime REAL, component_type TEXT NOT NULL, component_id TEXT, chunk_count INTEGER DEFAULT 1, indexed_at TEXT DEFAULT CURRENT_TIMESTAMP, last_checked TEXT DEFAULT CURRENT_TIMESTAMP );
-- Framework component embeddings CREATE TABLE IF NOT EXISTS component_embeddings ( id INTEGER PRIMARY KEY AUTOINCREMENT, component_id TEXT NOT NULL, file_path TEXT NOT NULL, content_hash TEXT NOT NULL, component_type TEXT NOT NULL, chunk_index INTEGER DEFAULT 0, chunk_total INTEGER DEFAULT 1, content_preview TEXT, embedding BLOB, model TEXT, metadata TEXT, created_at TEXT DEFAULT CURRENT_TIMESTAMP, updated_at TEXT DEFAULT CURRENT_TIMESTAMP, UNIQUE(component_id, chunk_index) );
-- Indexes for fast lookup CREATE INDEX IF NOT EXISTS idx_component_hashes_type ON component_hashes(component_type); CREATE INDEX IF NOT EXISTS idx_component_hashes_hash ON component_hashes(content_hash); CREATE INDEX IF NOT EXISTS idx_component_embeddings_id ON component_embeddings(component_id); CREATE INDEX IF NOT EXISTS idx_component_embeddings_type ON component_embeddings(component_type); CREATE INDEX IF NOT EXISTS idx_component_embeddings_hash ON component_embeddings(content_hash);
-- Insert default embedding model INSERT OR IGNORE INTO embedding_models (name, dimensions, version) VALUES ('all-MiniLM-L6-v2', 384, '2.2.2');
-- Insert schema version INSERT OR IGNORE INTO schema_version (version) VALUES (1); """
def compute_content_hash(file_path: Path) -> str: """Compute SHA256 hash of file content.""" with open(file_path, 'rb') as f: return hashlib.sha256(f.read()).hexdigest()
def get_component_id(file_path: Path, component_type: str) -> str: """Generate unique component ID from path and type.""" # For skills, use the parent directory name if component_type == "skill": return f"skill/{file_path.parent.name}" # For others, use the stem return f"{component_type}/{file_path.stem}"
def init_database() -> sqlite3.Connection: """Initialize the platform-index.db database.""" CONTEXT_STORAGE.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(PLATFORM_INDEX_DB))
conn.row_factory = sqlite3.Row
# Enable WAL mode for concurrent access
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA cache_size=-64000") # 64MB cache
# Create schema
conn.executescript(SCHEMA)
conn.commit()
return conn
def get_connection() -> sqlite3.Connection: """Get database connection.""" if not PLATFORM_INDEX_DB.exists(): return init_database()
conn = sqlite3.connect(str(PLATFORM_INDEX_DB))
conn.row_factory = sqlite3.Row
return conn
def discover_components() -> Dict[str, List[Path]]: """Discover all framework components by type.""" components = {}
for comp_type, base_dir in COMPONENT_DIRS.items():
if not base_dir.exists():
continue
patterns = FILE_PATTERNS.get(comp_type, ["*.md"])
files = []
for pattern in patterns:
files.extend(base_dir.glob(pattern))
# Filter out non-files and hidden files
files = [f for f in files if f.is_file() and not f.name.startswith('.')]
components[comp_type] = sorted(files)
return components
def check_file_changed(conn: sqlite3.Connection, file_path: Path) -> Tuple[bool, Optional[str]]: """Check if file has changed since last index.""" cursor = conn.execute( "SELECT content_hash, mtime FROM component_hashes WHERE file_path = ?", (str(file_path),) ) row = cursor.fetchone()
if not row:
# New file
return True, None
# Quick check: mtime
current_mtime = file_path.stat().st_mtime
if current_mtime == row['mtime']:
return False, row['content_hash']
# Mtime changed, verify with hash
current_hash = compute_content_hash(file_path)
if current_hash == row['content_hash']:
# Content same, update mtime
conn.execute(
"UPDATE component_hashes SET mtime = ?, last_checked = ? WHERE file_path = ?",
(current_mtime, datetime.utcnow().isoformat(), str(file_path))
)
return False, current_hash
return True, current_hash
def index_component(conn: sqlite3.Connection, file_path: Path, component_type: str, content_hash: str, generate_embeddings: bool = False) -> Dict[str, Any]: """Index a single component.""" component_id = get_component_id(file_path, component_type) content = file_path.read_text(encoding='utf-8', errors='replace') file_stat = file_path.stat()
# Extract preview (first 500 chars)
content_preview = content[:500].strip()
# Extract metadata from frontmatter if present
metadata = {}
if content.startswith('---'):
try:
end_idx = content.find('---', 3)
if end_idx > 0:
import yaml
frontmatter = yaml.safe_load(content[3:end_idx])
if isinstance(frontmatter, dict):
metadata = {
'title': frontmatter.get('title', ''),
'summary': frontmatter.get('summary', ''),
'keywords': frontmatter.get('keywords', []),
'status': frontmatter.get('status', ''),
}
except Exception:
pass
# Update hash tracking
conn.execute("""
INSERT OR REPLACE INTO component_hashes
(file_path, content_hash, file_size, mtime, component_type, component_id, chunk_count, indexed_at, last_checked)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
str(file_path),
content_hash,
file_stat.st_size,
file_stat.st_mtime,
component_type,
component_id,
1, # Single chunk for now
datetime.utcnow().isoformat(),
datetime.utcnow().isoformat()
))
# Insert embedding record (without actual embedding for now)
conn.execute("""
INSERT OR REPLACE INTO component_embeddings
(component_id, file_path, content_hash, component_type, chunk_index, chunk_total,
content_preview, embedding, model, metadata, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
component_id,
str(file_path),
content_hash,
component_type,
0,
1,
content_preview,
None, # Embedding to be generated separately
'all-MiniLM-L6-v2',
json.dumps(metadata),
datetime.utcnow().isoformat()
))
return {
'component_id': component_id,
'file_path': str(file_path),
'content_hash': content_hash,
'component_type': component_type,
'content_length': len(content),
}
def index_all_components(only_changed: bool = False, generate_embeddings: bool = False) -> Dict[str, Any]: """Index all framework components.""" conn = get_connection() components = discover_components()
stats = {
'total_files': 0,
'indexed': 0,
'skipped': 0,
'errors': 0,
'by_type': {},
'start_time': time.time(),
}
for comp_type, files in components.items():
type_stats = {'indexed': 0, 'skipped': 0, 'errors': 0}
for file_path in files:
stats['total_files'] += 1
try:
changed, content_hash = check_file_changed(conn, file_path)
if only_changed and not changed:
stats['skipped'] += 1
type_stats['skipped'] += 1
continue
# Compute hash if not already done
if content_hash is None:
content_hash = compute_content_hash(file_path)
index_component(conn, file_path, comp_type, content_hash, generate_embeddings)
stats['indexed'] += 1
type_stats['indexed'] += 1
except Exception as e:
stats['errors'] += 1
type_stats['errors'] += 1
print(f"Error indexing {file_path}: {e}", file=sys.stderr)
stats['by_type'][comp_type] = type_stats
conn.commit()
conn.close()
stats['elapsed_time'] = time.time() - stats['start_time']
return stats
def get_statistics() -> Dict[str, Any]: """Get database statistics.""" conn = get_connection()
stats = {
'database_path': str(PLATFORM_INDEX_DB),
'database_size': PLATFORM_INDEX_DB.stat().st_size if PLATFORM_INDEX_DB.exists() else 0,
}
# Total components indexed
cursor = conn.execute("SELECT COUNT(*) as count FROM component_hashes")
stats['total_components'] = cursor.fetchone()['count']
# By type
cursor = conn.execute("""
SELECT component_type, COUNT(*) as count
FROM component_hashes
GROUP BY component_type
""")
stats['by_type'] = {row['component_type']: row['count'] for row in cursor.fetchall()}
# Embeddings count
cursor = conn.execute("SELECT COUNT(*) as count FROM component_embeddings WHERE embedding IS NOT NULL")
stats['embeddings_count'] = cursor.fetchone()['count']
# Pending embeddings
cursor = conn.execute("SELECT COUNT(*) as count FROM component_embeddings WHERE embedding IS NULL")
stats['pending_embeddings'] = cursor.fetchone()['count']
# Last indexed
cursor = conn.execute("SELECT MAX(indexed_at) as last FROM component_hashes")
stats['last_indexed'] = cursor.fetchone()['last']
conn.close()
return stats
def verify_database() -> Dict[str, Any]: """Verify database integrity and consistency.""" conn = get_connection()
results = {
'integrity_check': '',
'orphaned_hashes': 0,
'missing_files': [],
'hash_mismatches': [],
}
# SQLite integrity check
cursor = conn.execute("PRAGMA integrity_check")
results['integrity_check'] = cursor.fetchone()[0]
# Check for orphaned embeddings (no matching hash)
cursor = conn.execute("""
SELECT COUNT(*) as count FROM component_embeddings e
LEFT JOIN component_hashes h ON e.content_hash = h.content_hash
WHERE h.id IS NULL
""")
results['orphaned_hashes'] = cursor.fetchone()['count']
# Check for missing files
cursor = conn.execute("SELECT file_path, content_hash FROM component_hashes")
for row in cursor.fetchall():
file_path = Path(row['file_path'])
if not file_path.exists():
results['missing_files'].append(str(file_path))
else:
# Verify hash
current_hash = compute_content_hash(file_path)
if current_hash != row['content_hash']:
results['hash_mismatches'].append({
'file': str(file_path),
'stored': row['content_hash'][:16] + '...',
'current': current_hash[:16] + '...',
})
conn.close()
return results
def print_stats(stats: Dict[str, Any]): """Pretty print statistics.""" print("\n" + "=" * 60) print("Platform Index Database Statistics") print("=" * 60) print(f"\nDatabase: {stats['database_path']}") print(f"Size: {stats['database_size'] / 1024:.1f} KB") print(f"\nTotal Components: {stats['total_components']}") print(f"Embeddings Generated: {stats['embeddings_count']}") print(f"Pending Embeddings: {stats['pending_embeddings']}") print(f"\nBy Type:") for comp_type, count in sorted(stats.get('by_type', {}).items()): print(f" {comp_type}: {count}") print(f"\nLast Indexed: {stats.get('last_indexed', 'Never')}")
def print_index_results(stats: Dict[str, Any]): """Pretty print indexing results.""" print("\n" + "=" * 60) print("Platform Index Complete") print("=" * 60) print(f"\nTotal Files: {stats['total_files']}") print(f"Indexed: {stats['indexed']}") print(f"Skipped (unchanged): {stats['skipped']}") print(f"Errors: {stats['errors']}") print(f"\nBy Type:") for comp_type, type_stats in sorted(stats.get('by_type', {}).items()): print(f" {comp_type}: {type_stats['indexed']} indexed, {type_stats['skipped']} skipped") print(f"\nElapsed Time: {stats['elapsed_time']:.2f}s")
def main(): parser = argparse.ArgumentParser(description="Platform Index Database Management") parser.add_argument('--init', action='store_true', help='Initialize database') parser.add_argument('--index', action='store_true', help='Index all components') parser.add_argument('--index-changed', action='store_true', help='Index only changed components') parser.add_argument('--stats', action='store_true', help='Show statistics') parser.add_argument('--verify', action='store_true', help='Verify database integrity') parser.add_argument('--with-embeddings', action='store_true', help='Generate embeddings (requires sentence-transformers)')
args = parser.parse_args()
if args.init:
print("Initializing platform-index.db...")
conn = init_database()
conn.close()
print(f"Created: {PLATFORM_INDEX_DB}")
return 0
if args.index or args.index_changed:
only_changed = args.index_changed
print(f"Indexing components ({'changed only' if only_changed else 'all'})...")
stats = index_all_components(only_changed=only_changed, generate_embeddings=args.with_embeddings)
print_index_results(stats)
return 0 if stats['errors'] == 0 else 1
if args.stats:
stats = get_statistics()
print_stats(stats)
return 0
if args.verify:
print("Verifying database integrity...")
results = verify_database()
print(f"\nIntegrity Check: {results['integrity_check']}")
print(f"Orphaned Hashes: {results['orphaned_hashes']}")
print(f"Missing Files: {len(results['missing_files'])}")
if results['missing_files']:
for f in results['missing_files'][:5]:
print(f" - {f}")
if len(results['missing_files']) > 5:
print(f" ... and {len(results['missing_files']) - 5} more")
print(f"Hash Mismatches: {len(results['hash_mismatches'])}")
if results['hash_mismatches']:
for m in results['hash_mismatches'][:5]:
print(f" - {m['file']}")
return 0 if results['integrity_check'] == 'ok' else 1
# Default: show stats
stats = get_statistics()
print_stats(stats)
return 0
if name == 'main': sys.exit(main())