#!/usr/bin/env python3
"""Evaluate CODITECT skills against Anthropic January 2026 standards.

Usage:
    python3 evaluate-skill-standard.py                   # Evaluate all skills
    python3 evaluate-skill-standard.py path/to/SKILL.md  # Evaluate single skill
    python3 evaluate-skill-standard.py --summary         # Summary only
    python3 evaluate-skill-standard.py --failing         # Show only failing skills
    python3 evaluate-skill-standard.py --json            # Output as JSON
"""

import sys
import re
import json
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List

import yaml

@dataclass
class SkillEvaluation:
    """Evaluation result for a single skill."""

    name: str
    path: str
    line_count: int
    scores: Dict[str, int]
    total: int
    max_score: int
    average: float
    critical_failures: List[str]
    meets_standard: bool

def evaluate_skill(skill_path: Path) -> SkillEvaluation:
    """Evaluate a skill against Anthropic January 2026 standards."""
    content = skill_path.read_text()
    lines = content.split('\n')

    # Parse frontmatter
    fm_match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
    frontmatter = {}
    if fm_match:
        try:
            frontmatter = yaml.safe_load(fm_match.group(1)) or {}
        except yaml.YAMLError:
            pass

    scores = {}

    # Get skill name
    name = frontmatter.get('name', skill_path.parent.name)

    # 1. Gerund naming (0 or 5) - CRITICAL
    # Must end in -ing or contain an -ing- segment
    is_gerund = (
        name.endswith('ing') or
        '-ing-' in name or
        name.endswith('ing-patterns') or
        name.endswith('ing-strategies')
    )
    scores['gerund_naming'] = 5 if is_gerund else 0
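    # The heuristic is literal: for example, 'debugging-strategies' passes
    # via the 'ing-strategies' suffix, while a hypothetical name like
    # 'writing-tests' matches none of the four conditions and scores 0.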

    # 2. Third-person description (0-5)
    desc = frontmatter.get('description', '')
    # Third-person typically starts with a verb ending in 's' (e.g., "Extracts", "Processes")
    third_person_patterns = [
        r'^[A-Z][a-z]+s\s',    # "Extracts text..."
        r'^[A-Z][a-z]+es\s',   # "Processes files..."
        r'^[A-Z][a-z]+ies\s',  # "Identifies issues..."
    ]
    third_person = any(re.match(p, desc) for p in third_person_patterns)
    # Also check for trigger phrases
    has_triggers = 'Use when' in desc or 'when' in desc.lower()
    scores['third_person_desc'] = (
        5 if (third_person and has_triggers)
        else 3 if (third_person or has_triggers)
        else 1
    )
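    # For illustration, a hypothetical description like "Extracts text from
    # PDF files. Use when a document needs parsing." matches the first
    # pattern and contains a trigger phrase, so it scores 5; a description
    # with neither still scores the floor of 1.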

    # 3. Line count (0 or 5) - CRITICAL
    line_count = len(lines)
    scores['under_500_lines'] = 5 if line_count <= 500 else 0

    # 4. Opinionated (prescribes THE way) (0-5)
    opinionated_phrases = [
        'THE way', 'prescribes', 'opinionated', 'best practice',
        'This skill encodes', 'MUST', 'ALWAYS', 'NEVER',
    ]
    opinionated_count = sum(1 for p in opinionated_phrases if p in content)
    scores['opinionated'] = min(5, opinionated_count + 1)
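    # Note: the +1 baseline means a skill containing none of these phrases
    # still scores 1 here; four or more distinct phrases reach the cap of 5.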

    # 5. Quality gates at each phase (0-5)
    gate_patterns = [
        r'Quality Gate:', r'Phase \d+ Complete:',
        r'- \[ \]', r'Completion Checklist',
    ]
    gates_found = sum(1 for p in gate_patterns if re.search(p, content))
    scores['quality_gates'] = min(5, gates_found + 1)

    # 6. Specific examples with input/output (0-5)
    has_example_section = bool(re.search(r'##.*Example', content))
    has_input = 'Input:' in content or 'Input (' in content or '**Input:**' in content
    has_output = 'Output:' in content or 'Output (' in content or '**Output:**' in content
    example_score = 0
    if has_example_section:
        example_score += 2
    if has_input:
        example_score += 1
    if has_output:
        example_score += 1
    if has_input and has_output:
        example_score += 1
    scores['specific_examples'] = min(5, example_score)

    # 7. Self-validating checklist (0-5)
    checklist_patterns = [
        r'- \[ \]', r'Completion Checklist', r'Self-Validation',
        r'Before marking', r'verify:',
    ]
    checklist_found = sum(1 for p in checklist_patterns if re.search(p, content, re.IGNORECASE))
    scores['self_validating'] = min(5, checklist_found + 1)
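    # Note: the '- [ ]' checkbox pattern is shared with the quality-gates
    # check above, so a single checklist contributes to both scores.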

    # 8. When to Use with triggers (0-5)
    has_when_to_use = bool(re.search(r'##.*When to Use', content))
    has_do_not_use = 'Do NOT use' in content or "Don't use" in content or 'NOT use' in content
    trigger_score = 0
    if has_when_to_use:
        trigger_score += 2
    if has_do_not_use:
        trigger_score += 2
    if '✅' in content or '❌' in content:
        trigger_score += 1
    scores['trigger_phrases'] = min(5, trigger_score)

    # 9. Troubleshooting section (0-5)
    has_troubleshooting = bool(re.search(r'##.*Troubleshoot', content, re.IGNORECASE))
    has_anti_patterns = 'Anti-Pattern' in content or 'anti-pattern' in content
    trouble_score = 0
    if has_troubleshooting:
        trouble_score += 3
    if has_anti_patterns:
        trouble_score += 2
    scores['troubleshooting'] = min(5, trouble_score)

    # Calculate totals
    total = sum(scores.values())
    max_score = len(scores) * 5
    average = round(total / len(scores), 2)

    # Identify critical failures
    critical_failures = []
    if scores['gerund_naming'] == 0:
        critical_failures.append(f'NOT_GERUND ({name})')
    if scores['under_500_lines'] == 0:
        critical_failures.append(f'OVER_500_LINES ({line_count})')

    return SkillEvaluation(
        name=name,
        path=str(skill_path),
        line_count=line_count,
        scores=scores,
        total=total,
        max_score=max_score,
        average=average,
        critical_failures=critical_failures,
        meets_standard=average >= 4.5 and not critical_failures,
    )

def evaluate_all_skills(skills_dir: Path) -> List[SkillEvaluation]:
    """Evaluate all skills in a directory."""
    results = []
    for skill_md in sorted(skills_dir.glob('*/SKILL.md')):
        try:
            result = evaluate_skill(skill_md)
            results.append(result)
        except Exception as e:
            print(f"Error evaluating {skill_md}: {e}", file=sys.stderr)
    return results
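
# Illustrative programmatic use (the path mirrors the default in main()):
#   results = evaluate_all_skills(Path.home() / '.coditect' / 'skills')
#   failing = [r.name for r in results if not r.meets_standard]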

def print_summary(results: List[SkillEvaluation]):
    """Print a summary of evaluation results."""
    if not results:
        # Avoid a ZeroDivisionError in the percentage math below
        print("No skills found to evaluate.", file=sys.stderr)
        return

    passing = [r for r in results if r.meets_standard]
    failing = [r for r in results if not r.meets_standard]

    # Count critical failures
    not_gerund = [r for r in results if any('NOT_GERUND' in f for f in r.critical_failures)]
    over_500 = [r for r in results if any('OVER_500' in f for f in r.critical_failures)]

    print("=" * 60)
    print("CODITECT Skill Standards Evaluation (Anthropic Jan 2026)")
    print("=" * 60)
    print(f"\nTotal skills evaluated: {len(results)}")
    print(f"Passing (≥4.5, no critical): {len(passing)} ({100 * len(passing) // len(results)}%)")
    print(f"Failing: {len(failing)} ({100 * len(failing) // len(results)}%)")
    print("\nCritical Failures:")
    print(f"  Not gerund naming: {len(not_gerund)}")
    print(f"  Over 500 lines: {len(over_500)}")

    # Score distribution
    scores = [r.average for r in results]
    print("\nScore Distribution:")
    print(f"  5.0:     {sum(1 for s in scores if s >= 5.0)}")
    print(f"  4.5-4.9: {sum(1 for s in scores if 4.5 <= s < 5.0)}")
    print(f"  4.0-4.4: {sum(1 for s in scores if 4.0 <= s < 4.5)}")
    print(f"  3.5-3.9: {sum(1 for s in scores if 3.5 <= s < 4.0)}")
    print(f"  3.0-3.4: {sum(1 for s in scores if 3.0 <= s < 3.5)}")
    print(f"  <3.0:    {sum(1 for s in scores if s < 3.0)}")

    avg_score = sum(scores) / len(scores) if scores else 0
    print(f"\nAverage score: {avg_score:.2f}/5.0")

def print_failing(results: List[SkillEvaluation], limit: int = 20):
    """Print skills that need work, sorted by score (lowest first)."""
    failing = [r for r in results if not r.meets_standard]
    failing_sorted = sorted(failing, key=lambda r: r.average)

print(f"\nTop {limit} Skills Needing Work:")
print("-" * 80)
print(f"{'Skill':<40} {'Score':>6} {'Lines':>6} Critical Failures")
print("-" * 80)

for r in failing_sorted[:limit]:
failures = ', '.join(r.critical_failures) if r.critical_failures else '-'
print(f"{r.name:<40} {r.average:>5.2f} {r.line_count:>6} {failures}")

def main():
    args = sys.argv[1:]

    # ADR-114: Skills are in the framework installation (~/.coditect)
    skills_dir = Path.home() / '.coditect' / 'skills'

    if not args or '--summary' in args or '--failing' in args or '--json' in args:
        # Evaluate all skills
        results = evaluate_all_skills(skills_dir)

        if '--json' in args:
            print(json.dumps([asdict(r) for r in results], indent=2))
        else:
            print_summary(results)
            if '--failing' in args or '--summary' not in args:
                print_failing(results)
    else:
        # Evaluate a single skill
        skill_path = Path(args[0])
        if not skill_path.exists():
            print(f"Error: {skill_path} not found", file=sys.stderr)
            sys.exit(1)

        result = evaluate_skill(skill_path)
        print(f"Skill: {result.name}")
        print(f"Path: {result.path}")
        print(f"Lines: {result.line_count}")
        print("\nScores:")
        for criterion, score in result.scores.items():
            status = "✅" if score >= 4 else "⚠️" if score >= 2 else "❌"
            print(f"  {status} {criterion}: {score}/5")
        print(f"\nTotal: {result.total}/{result.max_score}")
        print(f"Average: {result.average}/5.0")
        print(f"Critical failures: {result.critical_failures or 'None'}")
        print(f"Meets standard: {'✅ Yes' if result.meets_standard else '❌ No'}")

if __name__ == '__main__':
    main()
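
# Illustrative follow-on, assuming jq is available (not required by this
# script): --json emits a list of SkillEvaluation objects, so failing
# skills can be listed with
#   python3 evaluate-skill-standard.py --json \
#       | jq '.[] | select(.meets_standard == false) | .name'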