#!/usr/bin/env python3
"""Semantic Skill Categorization Script

Categorizes CODITECT skills using:

  1. Semantic embeddings (if available)
  2. Frontmatter analysis
  3. Keyword matching
  4. Content analysis

Usage:
    python3 categorize-skills.py --dry-run         # Preview categorizations
    python3 categorize-skills.py --update          # Update skill frontmatters
    python3 categorize-skills.py --use-embeddings  # Use semantic embeddings

Track: ARCH.3 (Unified LLM Component Architecture)
Author: CODITECT Framework Team
Date: 2026-01-29
"""

import argparse
import json
import logging
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml

# Add the MCP skill server directory to the import path so that `server`
# (SkillRegistry / SkillMetadata) can be imported below.
sys.path.insert(0, str(Path(__file__).parent.parent / "tools" / "mcp-skill-server"))

from server import SkillRegistry, SkillMetadata

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("categorize-skills")

class SemanticSkillCategorizer:
    """Categorizes skills using semantic analysis.

    Combines keyword matching against a category framework with optional
    sentence-transformer embeddings, producing a primary (and possibly
    secondary) category plus confidence score for each skill.
    """

    def __init__(self, framework_path: Optional[Path] = None):
        """Initialize the categorizer.

        Args:
            framework_path: Path to the category framework YAML. Defaults to
                ``config/skill-category-framework.yaml`` relative to this
                script's parent directory.
        """
        self.registry = SkillRegistry()
        self.framework = self._load_framework(framework_path)
        self.embeddings_available = self._check_embeddings()
        # Lazily loaded SentenceTransformer; see _get_embedding_model().
        self._embedding_model = None

    def _load_framework(self, path: Optional[Path]) -> Dict:
        """Load the category framework YAML into a dict."""
        if path is None:
            path = Path(__file__).parent.parent / "config" / "skill-category-framework.yaml"

        with open(path) as f:
            return yaml.safe_load(f)

    def _check_embeddings(self) -> bool:
        """Return True if the optional sentence-transformers package is importable."""
        try:
            import sentence_transformers  # noqa: F401
            return True
        except ImportError:
            return False

    def _get_embedding_model(self):
        """Load and cache the embedding model on first use (None if unavailable)."""
        if self._embedding_model is None and self.embeddings_available:
            from sentence_transformers import SentenceTransformer
            self._embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        return self._embedding_model

    def extract_skill_text(self, skill: SkillMetadata) -> str:
        """Extract lowercase searchable text from a skill.

        Concatenates the skill name, description, and (when a SKILL.md file
        exists) the first 500 characters of its body with the YAML
        frontmatter stripped.
        """
        parts = [skill.name, skill.description]

        # Read SKILL.md for more content.
        if skill.skill_path:
            skill_md = skill.skill_path / "SKILL.md"
            if skill_md.exists():
                try:
                    content = skill_md.read_text()
                    # Drop frontmatter: keep everything after the second "---".
                    if content.startswith("---"):
                        parts_marker = content.split("---", 2)
                        if len(parts_marker) >= 3:
                            content = parts_marker[2]
                    # First 500 chars of content.
                    parts.append(content[:500])
                except Exception:
                    # Best-effort: an unreadable SKILL.md contributes nothing.
                    pass

        return " ".join(parts).lower()

    def categorize_by_keywords(self, skill: SkillMetadata) -> List[Tuple[str, float]]:
        """Score categories by keyword matching.

        Returns:
            (category_key, score) pairs sorted by score descending, where the
            score is the fraction of the category's keywords found in the
            skill text. Categories with no matches are omitted.
        """
        text = self.extract_skill_text(skill)
        scores = []

        for cat_key, cat_data in self.framework["categories"].items():
            keywords = cat_data.get("keywords", [])
            score = sum(1 for kw in keywords if kw.lower() in text)
            if score > 0:
                # Normalize by keyword count so categories with many keywords
                # aren't automatically favored.
                normalized = score / len(keywords) if keywords else 0
                scores.append((cat_key, normalized))

        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    def categorize_by_embeddings(self, skill: SkillMetadata) -> List[Tuple[str, float]]:
        """Score categories by cosine similarity of semantic embeddings.

        Returns an empty list when sentence-transformers is not available.
        """
        if not self.embeddings_available:
            return []

        model = self._get_embedding_model()
        if model is None:
            return []

        import numpy as np  # hoisted out of the per-category loop

        skill_text = self.extract_skill_text(skill)
        skill_embedding = model.encode(skill_text)

        scores = []
        for cat_key, cat_data in self.framework["categories"].items():
            # Build category description from name, description, and exemplars.
            exemplars = cat_data.get("exemplars", [])
            cat_text = f"{cat_data['name']}: {cat_data['description']}"
            if exemplars:
                cat_text += " " + " ".join(exemplars)

            cat_embedding = model.encode(cat_text)

            # Cosine similarity.
            similarity = np.dot(skill_embedding, cat_embedding) / (
                np.linalg.norm(skill_embedding) * np.linalg.norm(cat_embedding)
            )
            scores.append((cat_key, float(similarity)))

        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    def categorize_skill(self, skill: SkillMetadata,
                         use_embeddings: bool = False) -> Dict:
        """Categorize a single skill with confidence scores.

        Combines keyword scores with (optionally) embedding scores using a
        0.6/0.4 weighting. The secondary category is reported only when its
        confidence exceeds 0.3.
        """
        # Get keyword scores.
        keyword_scores = self.categorize_by_keywords(skill)

        # Get embedding scores if requested.
        embedding_scores = []
        if use_embeddings and self.embeddings_available:
            embedding_scores = self.categorize_by_embeddings(skill)

        # Combine scores across the union of categories seen by either method.
        combined = {}
        all_cats = set([c for c, _ in keyword_scores] + [c for c, _ in embedding_scores])

        for cat in all_cats:
            kw_score = next((s for c, s in keyword_scores if c == cat), 0)
            emb_score = next((s for c, s in embedding_scores if c == cat), 0)

            if use_embeddings and embedding_scores:
                # Weighted combination: keywords dominate slightly.
                combined[cat] = 0.6 * kw_score + 0.4 * emb_score
            else:
                combined[cat] = kw_score

        # Get top categories.
        sorted_cats = sorted(combined.items(), key=lambda x: x[1], reverse=True)

        primary_cat = sorted_cats[0] if sorted_cats else ("general", 0.0)
        secondary_cat = sorted_cats[1] if len(sorted_cats) > 1 else (None, 0.0)

        return {
            "skill": skill.name,
            "primary_category": primary_cat[0],
            "primary_confidence": primary_cat[1],
            "secondary_category": secondary_cat[0] if secondary_cat[1] > 0.3 else None,
            "secondary_confidence": secondary_cat[1],
            "all_scores": dict(sorted_cats[:5]),
            "method": "hybrid" if (use_embeddings and embedding_scores) else "keyword",
        }

    def categorize_all(self, use_embeddings: bool = False) -> List[Dict]:
        """Categorize every skill in the registry, logging progress every 50."""
        self.registry.load()
        all_skills = self.registry.get_all_metadata()

        results = []
        for i, skill in enumerate(all_skills):
            if i % 50 == 0:
                logger.info(f"Categorized {i}/{len(all_skills)} skills...")

            result = self.categorize_skill(skill, use_embeddings)
            results.append(result)

        return results

    def update_skill_frontmatter(self, skill: SkillMetadata,
                                 category: str, confidence: float) -> bool:
        """Write the category into a skill's SKILL.md frontmatter.

        Returns True on success, False when the skill has no SKILL.md or the
        write fails (the failure is logged, not raised).
        """
        if not skill.skill_path:
            return False

        skill_md = skill.skill_path / "SKILL.md"
        if not skill_md.exists():
            return False

        try:
            content = skill_md.read_text()

            # Parse existing frontmatter, tolerating a missing/partial block.
            if content.startswith("---"):
                parts = content.split("---", 2)
                if len(parts) >= 3:
                    frontmatter = yaml.safe_load(parts[1]) or {}
                    body = parts[2]
                else:
                    frontmatter = {}
                    body = content
            else:
                frontmatter = {}
                body = content

            # Update category fields.
            frontmatter["category"] = category
            frontmatter["category_confidence"] = round(confidence, 2)
            frontmatter["category_method"] = "semantic-auto"

            # Rebuild file with refreshed frontmatter.
            new_frontmatter = yaml.dump(frontmatter, default_flow_style=False, allow_unicode=True)
            new_content = f"---\n{new_frontmatter}---{body}"

            skill_md.write_text(new_content)
            return True

        except Exception as e:
            logger.error(f"Failed to update {skill.name}: {e}")
            return False

    def generate_report(self, results: List[Dict]) -> Dict:
        """Summarize categorization results.

        Returns totals, the category distribution (descending by count), the
        average primary confidence, and high/medium/low confidence counts
        (thresholds 0.7 and 0.4).
        """
        # Category distribution and confidence collection.
        cat_counts = {}
        confidences = []

        for r in results:
            cat = r["primary_category"]
            cat_counts[cat] = cat_counts.get(cat, 0) + 1
            confidences.append(r["primary_confidence"])

        avg_confidence = sum(confidences) / len(confidences) if confidences else 0

        return {
            "total_skills": len(results),
            "categories_used": len(cat_counts),
            "average_confidence": round(avg_confidence, 2),
            "category_distribution": dict(sorted(cat_counts.items(), key=lambda x: -x[1])),
            "high_confidence": sum(1 for c in confidences if c >= 0.7),
            "medium_confidence": sum(1 for c in confidences if 0.4 <= c < 0.7),
            "low_confidence": sum(1 for c in confidences if c < 0.4),
        }

def main():
    """CLI entry point: parse args, categorize all skills, print a report,
    and optionally update skill frontmatters and/or save results to JSON."""
    parser = argparse.ArgumentParser(description="Categorize CODITECT skills")
    parser.add_argument("--dry-run", action="store_true",
                        help="Preview categorizations without updating")
    parser.add_argument("--update", action="store_true",
                        help="Update skill frontmatters with categories")
    parser.add_argument("--use-embeddings", action="store_true",
                        help="Use semantic embeddings (requires sentence-transformers)")
    parser.add_argument("--output", "-o", help="Save results to JSON file")
    parser.add_argument("--framework", help="Path to category framework YAML")

    args = parser.parse_args()

    # Require at least one action flag; otherwise show help and exit.
    if not args.dry_run and not args.update and not args.output:
        parser.print_help()
        sys.exit(1)

    # Initialize categorizer.
    framework_path = Path(args.framework) if args.framework else None
    categorizer = SemanticSkillCategorizer(framework_path)

    # Check embeddings availability.
    if args.use_embeddings and not categorizer.embeddings_available:
        logger.error("sentence-transformers not installed. Run: pip install sentence-transformers")
        sys.exit(1)

    # Categorize all skills.
    logger.info("Starting skill categorization...")
    results = categorizer.categorize_all(use_embeddings=args.use_embeddings)

    # Generate report.
    report = categorizer.generate_report(results)

    print("\n" + "=" * 60)
    print("CATEGORIZATION REPORT")
    print("=" * 60)
    print(f"Total skills: {report['total_skills']}")
    print(f"Categories used: {report['categories_used']}")
    print(f"Average confidence: {report['average_confidence']:.2f}")
    print(f"\nHigh confidence (>=0.7): {report['high_confidence']}")
    print(f"Medium confidence (0.4-0.7): {report['medium_confidence']}")
    print(f"Low confidence (<0.4): {report['low_confidence']}")
    print("\nCategory distribution:")
    for cat, count in report['category_distribution'].items():
        print(f" {cat:20} {count:3} skills")

    # Update frontmatters if requested.
    if args.update:
        print("\n" + "=" * 60)
        print("UPDATING FRONTMATTERS")
        print("=" * 60)

        updated = 0
        for result in results:
            if result["primary_confidence"] >= 0.4:  # Only update if confident enough
                skill = categorizer.registry.get_skill(result["skill"])
                if skill:
                    success = categorizer.update_skill_frontmatter(
                        skill,
                        result["primary_category"],
                        result["primary_confidence"]
                    )
                    if success:
                        updated += 1

        print(f"Updated {updated}/{len(results)} skill frontmatters")

    # Save results if requested.
    if args.output:
        with open(args.output, 'w') as f:
            json.dump({"report": report, "results": results}, f, indent=2)
        print(f"\nResults saved to: {args.output}")

# Script entry guard (the dunders were lost in a paste: name/"main" must be
# __name__/"__main__" or the script never runs).
if __name__ == "__main__":
    main()