#!/usr/bin/env python3
"""Semantic Skill Categorization Script.

Categorizes CODITECT skills using:
- Semantic embeddings (if available)
- Frontmatter analysis
- Keyword matching
- Content analysis

Usage:
    python3 categorize-skills.py --dry-run          # Preview categorizations
    python3 categorize-skills.py --update           # Update skill frontmatters
    python3 categorize-skills.py --use-embeddings   # Use semantic embeddings

Track: ARCH.3 (Unified LLM Component Architecture)
Author: CODITECT Framework Team
Date: 2026-01-29
"""
import argparse
import json
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml

# Add the MCP skill server directory to the import path so that the
# `server` module (SkillRegistry / SkillMetadata) is importable.
# BUG FIX: was `Path(file)` — `file` is undefined; `__file__` is the
# path of the current script.
sys.path.insert(0, str(Path(__file__).parent.parent / "tools" / "mcp-skill-server"))

from server import SkillRegistry, SkillMetadata

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("categorize-skills")
class SemanticSkillCategorizer:
    """Categorizes skills using semantic analysis.

    Combines keyword matching against a category framework with optional
    sentence-transformer embeddings for a hybrid confidence score.
    """

    def __init__(self, framework_path: Optional[Path] = None):
        """Initialize the categorizer.

        Args:
            framework_path: Override path to the category framework YAML.
                Defaults to ``config/skill-category-framework.yaml`` relative
                to this script's parent directory.
        """
        self.registry = SkillRegistry()
        self.framework = self._load_framework(framework_path)
        self.embeddings_available = self._check_embeddings()
        self._embedding_model = None
        # Cache of category key -> embedding vector. Category texts are
        # invariant across skills, so encode each one only once per run.
        self._category_embeddings: Dict[str, object] = {}

    def _load_framework(self, path: Optional[Path]) -> Dict:
        """Load the category framework YAML into a dict."""
        if path is None:
            path = Path(__file__).parent.parent / "config" / "skill-category-framework.yaml"
        with open(path) as f:
            return yaml.safe_load(f)

    def _check_embeddings(self) -> bool:
        """Return True if sentence-transformers is importable."""
        try:
            import sentence_transformers  # noqa: F401
            return True
        except ImportError:
            return False

    def _get_embedding_model(self):
        """Lazily load and return the embedding model, or None if unavailable."""
        if self._embedding_model is None and self.embeddings_available:
            from sentence_transformers import SentenceTransformer
            self._embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        return self._embedding_model

    def extract_skill_text(self, skill: SkillMetadata) -> str:
        """Extract lowercase searchable text from a skill.

        Combines the skill name, description, and the first 500 characters
        of the SKILL.md body (YAML frontmatter stripped, if present).
        """
        # Guard against a missing name/description: a None entry would make
        # str.join raise TypeError.
        parts = [p for p in (skill.name, skill.description) if p]
        if skill.skill_path:
            skill_md = skill.skill_path / "SKILL.md"
            if skill_md.exists():
                try:
                    content = skill_md.read_text()
                    # Strip the frontmatter block delimited by "---" markers.
                    if content.startswith("---"):
                        sections = content.split("---", 2)
                        if len(sections) >= 3:
                            content = sections[2]
                    # Only the first 500 chars are needed for matching.
                    parts.append(content[:500])
                except Exception:
                    # Best-effort: an unreadable SKILL.md contributes no text.
                    pass
        return " ".join(parts).lower()

    def categorize_by_keywords(self, skill: SkillMetadata) -> List[Tuple[str, float]]:
        """Score categories by keyword hits in the skill text.

        Returns:
            (category_key, normalized_score) pairs sorted by descending
            score. Categories with zero keyword hits are omitted.
        """
        text = self.extract_skill_text(skill)
        scores = []
        for cat_key, cat_data in self.framework["categories"].items():
            keywords = cat_data.get("keywords", [])
            hits = sum(1 for kw in keywords if kw.lower() in text)
            if hits > 0:
                # Normalize by keyword count so categories with long keyword
                # lists are not favored simply for having more keywords.
                scores.append((cat_key, hits / len(keywords)))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    def categorize_by_embeddings(self, skill: SkillMetadata) -> List[Tuple[str, float]]:
        """Score categories by cosine similarity of sentence embeddings.

        Returns:
            All (category_key, similarity) pairs sorted descending, or an
            empty list when embeddings are unavailable.
        """
        if not self.embeddings_available:
            return []
        model = self._get_embedding_model()
        if model is None:
            return []
        # Deferred import: numpy is only needed on the embeddings path.
        # Hoisted out of the per-category loop (was re-imported each
        # iteration in the original).
        import numpy as np

        skill_embedding = model.encode(self.extract_skill_text(skill))
        skill_norm = np.linalg.norm(skill_embedding)
        scores = []
        for cat_key, cat_data in self.framework["categories"].items():
            cat_embedding = self._category_embeddings.get(cat_key)
            if cat_embedding is None:
                # Build category description from name, description and
                # exemplars; encode once and cache (invariant across skills).
                cat_text = f"{cat_data['name']}: {cat_data['description']}"
                exemplars = cat_data.get("exemplars", [])
                if exemplars:
                    cat_text += " " + " ".join(exemplars)
                cat_embedding = model.encode(cat_text)
                self._category_embeddings[cat_key] = cat_embedding
            # Cosine similarity between skill and category embeddings.
            similarity = np.dot(skill_embedding, cat_embedding) / (
                skill_norm * np.linalg.norm(cat_embedding)
            )
            scores.append((cat_key, float(similarity)))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    def categorize_skill(self, skill: SkillMetadata,
                         use_embeddings: bool = False) -> Dict:
        """Categorize a single skill with confidence scores.

        Args:
            skill: The skill to categorize.
            use_embeddings: Blend embedding similarity into the score
                (0.6 keyword + 0.4 embedding) when available.

        Returns:
            Dict with primary/secondary category, confidences, top-5 scores,
            and the scoring method used ("hybrid" or "keyword").
        """
        keyword_scores = self.categorize_by_keywords(skill)
        embedding_scores = []
        if use_embeddings and self.embeddings_available:
            embedding_scores = self.categorize_by_embeddings(skill)

        # Combine both score sources over the union of scored categories.
        combined = {}
        all_cats = {c for c, _ in keyword_scores} | {c for c, _ in embedding_scores}
        for cat in all_cats:
            kw_score = next((s for c, s in keyword_scores if c == cat), 0)
            emb_score = next((s for c, s in embedding_scores if c == cat), 0)
            if use_embeddings and embedding_scores:
                # Weighted blend: keywords dominate, embeddings refine.
                combined[cat] = 0.6 * kw_score + 0.4 * emb_score
            else:
                combined[cat] = kw_score

        sorted_cats = sorted(combined.items(), key=lambda x: x[1], reverse=True)
        primary_cat = sorted_cats[0] if sorted_cats else ("general", 0.0)
        secondary_cat = sorted_cats[1] if len(sorted_cats) > 1 else (None, 0.0)
        return {
            "skill": skill.name,
            "primary_category": primary_cat[0],
            "primary_confidence": primary_cat[1],
            # Only report a secondary category when it is confident enough.
            "secondary_category": secondary_cat[0] if secondary_cat[1] > 0.3 else None,
            "secondary_confidence": secondary_cat[1],
            "all_scores": dict(sorted_cats[:5]),
            "method": "hybrid" if (use_embeddings and embedding_scores) else "keyword",
        }

    def categorize_all(self, use_embeddings: bool = False) -> List[Dict]:
        """Categorize every skill in the registry, logging progress."""
        self.registry.load()
        all_skills = self.registry.get_all_metadata()
        results = []
        for i, skill in enumerate(all_skills):
            if i % 50 == 0:
                logger.info("Categorized %d/%d skills...", i, len(all_skills))
            results.append(self.categorize_skill(skill, use_embeddings))
        return results

    def update_skill_frontmatter(self, skill: SkillMetadata,
                                 category: str, confidence: float) -> bool:
        """Write the category result into the skill's SKILL.md frontmatter.

        Returns:
            True on success, False if the skill has no SKILL.md or the
            write fails (the error is logged, not raised).
        """
        if not skill.skill_path:
            return False
        skill_md = skill.skill_path / "SKILL.md"
        if not skill_md.exists():
            return False
        try:
            content = skill_md.read_text()
            # Parse existing frontmatter if present; otherwise start fresh.
            if content.startswith("---"):
                parts = content.split("---", 2)
                if len(parts) >= 3:
                    frontmatter = yaml.safe_load(parts[1]) or {}
                    body = parts[2]
                else:
                    frontmatter = {}
                    body = content
            else:
                frontmatter = {}
                body = content
            # Record category plus provenance of how it was assigned.
            frontmatter["category"] = category
            frontmatter["category_confidence"] = round(confidence, 2)
            frontmatter["category_method"] = "semantic-auto"
            # Rebuild the file with updated frontmatter.
            new_frontmatter = yaml.dump(frontmatter, default_flow_style=False, allow_unicode=True)
            new_content = f"---\n{new_frontmatter}---{body}"
            skill_md.write_text(new_content)
            return True
        except Exception as e:
            logger.error(f"Failed to update {skill.name}: {e}")
            return False

    def generate_report(self, results: List[Dict]) -> Dict:
        """Summarize categorization results.

        Returns:
            Dict with totals, average confidence, per-category counts
            (sorted by descending count), and confidence-band tallies.
        """
        cat_counts = {}
        confidences = []
        for r in results:
            cat = r["primary_category"]
            cat_counts[cat] = cat_counts.get(cat, 0) + 1
            confidences.append(r["primary_confidence"])
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0
        return {
            "total_skills": len(results),
            "categories_used": len(cat_counts),
            "average_confidence": round(avg_confidence, 2),
            "category_distribution": dict(sorted(cat_counts.items(), key=lambda x: -x[1])),
            "high_confidence": sum(1 for c in confidences if c >= 0.7),
            "medium_confidence": sum(1 for c in confidences if 0.4 <= c < 0.7),
            "low_confidence": sum(1 for c in confidences if c < 0.4),
        }
def main():
    """CLI entry point: categorize skills, print a report, optionally update files."""
    parser = argparse.ArgumentParser(description="Categorize CODITECT skills")
    parser.add_argument("--dry-run", action="store_true",
                        help="Preview categorizations without updating")
    parser.add_argument("--update", action="store_true",
                        help="Update skill frontmatters with categories")
    parser.add_argument("--use-embeddings", action="store_true",
                        help="Use semantic embeddings (requires sentence-transformers)")
    parser.add_argument("--output", "-o", help="Save results to JSON file")
    parser.add_argument("--framework", help="Path to category framework YAML")
    args = parser.parse_args()

    # Require at least one action flag; otherwise show usage and exit.
    if not args.dry_run and not args.update and not args.output:
        parser.print_help()
        sys.exit(1)

    # Initialize categorizer
    framework_path = Path(args.framework) if args.framework else None
    categorizer = SemanticSkillCategorizer(framework_path)

    # Fail fast if embeddings were requested but the dependency is missing.
    if args.use_embeddings and not categorizer.embeddings_available:
        logger.error("sentence-transformers not installed. Run: pip install sentence-transformers")
        sys.exit(1)

    # Categorize all skills
    logger.info("Starting skill categorization...")
    results = categorizer.categorize_all(use_embeddings=args.use_embeddings)

    # Generate report
    report = categorizer.generate_report(results)
    print("\n" + "=" * 60)
    print("CATEGORIZATION REPORT")
    print("=" * 60)
    print(f"Total skills: {report['total_skills']}")
    print(f"Categories used: {report['categories_used']}")
    print(f"Average confidence: {report['average_confidence']:.2f}")
    print(f"\nHigh confidence (>=0.7): {report['high_confidence']}")
    print(f"Medium confidence (0.4-0.7): {report['medium_confidence']}")
    print(f"Low confidence (<0.4): {report['low_confidence']}")
    print("\nCategory distribution:")
    for cat, count in report['category_distribution'].items():
        print(f"  {cat:20} {count:3} skills")

    # Update frontmatters if requested
    if args.update:
        print("\n" + "=" * 60)
        print("UPDATING FRONTMATTERS")
        print("=" * 60)
        updated = 0
        for result in results:
            if result["primary_confidence"] >= 0.4:  # Only update if confident enough
                skill = categorizer.registry.get_skill(result["skill"])
                if skill:
                    success = categorizer.update_skill_frontmatter(
                        skill,
                        result["primary_category"],
                        result["primary_confidence"]
                    )
                    if success:
                        updated += 1
        print(f"Updated {updated}/{len(results)} skill frontmatters")

    # Save results if requested
    if args.output:
        with open(args.output, 'w') as f:
            json.dump({"report": report, "results": results}, f, indent=2)
        print(f"\nResults saved to: {args.output}")


# BUG FIX: was `if name == "main"` — the entry guard must compare
# __name__ against "__main__" or the script never runs.
if __name__ == "__main__":
    main()