
#!/usr/bin/env python3
"""CODITECT QA Common Library.

Shared utilities for all component type graders. Used by:
grade-agents.py, grade-skills.py, grade-commands.py, grade-hooks.py,
grade-scripts.py, grade-workflows.py, grade-tools.py, grade-all.py

ADR-161: Component Quality Assurance Framework
"""

import json
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path

import yaml

# Valid Anthropic tools for CODITECT components
VALID_TOOLS = {
    "Read", "Write", "Edit", "Bash", "Grep", "Glob", "LS",
    "TodoWrite", "WebSearch", "WebFetch", "Task", "Skill",
    "NotebookEdit", "ExitPlanMode",
}

# Models a component frontmatter may declare
VALID_MODELS = {"sonnet", "opus", "haiku"}

# Domain-specific terms for specificity scoring
DOMAIN_TERMS = {
    "api", "endpoint", "middleware", "authentication", "authorization",
    "database", "schema", "migration", "query", "index",
    "component", "module", "service", "handler", "controller",
    "deploy", "kubernetes", "docker", "container", "ci/cd",
    "test", "assertion", "mock", "fixture", "coverage",
    "security", "encryption", "token", "oauth", "rbac",
    "agent", "skill", "command", "hook", "workflow",
    "frontmatter", "yaml", "markdown", "template",
    "coditect", "moe", "cef", "pilot", "track",
    "validation", "grading", "compliance", "standard",
    "react", "typescript", "python", "rust", "django",
    "gcp", "aws", "terraform", "helm", "grafana",
    "sqlite", "postgresql", "redis", "elasticsearch",
}

# Generic/filler words to exclude from specificity
GENERIC_WORDS = {
    "the", "a", "an", "is", "are", "was", "were", "be", "been",
    "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "can", "shall", "must",
    "this", "that", "these", "those", "it", "its", "they", "them",
    "and", "or", "but", "not", "no", "if", "then", "else",
    "for", "of", "in", "on", "at", "to", "from", "by", "with", "as",
    "so", "very", "just", "also", "more", "most", "some", "all",
    "any", "each", "every", "other", "such", "than", "about",
    "into", "through", "during", "before", "after",
}

def parse_frontmatter(content):
    """Extract YAML frontmatter from markdown content.

    Args:
        content: Full markdown text, optionally starting with a
            ``---``-delimited YAML frontmatter block.

    Returns:
        tuple: (frontmatter_dict, body_text). If no frontmatter is present
        or the YAML is malformed, returns ({}, content) unchanged.
    """
    match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
    if not match:
        return {}, content
    try:
        fm = yaml.safe_load(match.group(1))
        # safe_load returns None for an empty frontmatter block
        if fm is None:
            fm = {}
        body = content[match.end():]
        return fm, body
    except yaml.YAMLError:
        # Malformed YAML: treat the whole file as body
        return {}, content

def count_words(text):
    """Return the number of whitespace-separated words in *text*."""
    return len(text.split())

def section_finder(body, patterns):
    """Find H2/H3 sections matching regex patterns.

    Args:
        body: Markdown body text
        patterns: List of regex patterns to match against section headings
            (patterns should be lowercase -- the body is lowercased first)

    Returns:
        dict: {pattern: bool} indicating which patterns matched
    """
    body_lower = body.lower()
    # Search each pattern against the lowercased body once
    return {pattern: bool(re.search(pattern, body_lower)) for pattern in patterns}

def content_quality_score(text):
    """Compute content quality heuristic scores for a component body.

    Args:
        text: Markdown body text to score.

    Returns:
        dict with:
        - specificity: ratio of domain terms to meaningful words (0.0-1.0)
        - code_quality: whether code examples appear runnable (0 or 1)
        - instruction_density: imperative verbs per 100 words
    """
    words = text.lower().split()
    word_count = len(words)

    if word_count == 0:
        return {"specificity": 0.0, "code_quality": 0, "instruction_density": 0.0}

    # Specificity: domain-term hits over meaningful (non-generic, len > 2) words
    meaningful_words = [w for w in words if w not in GENERIC_WORDS and len(w) > 2]
    domain_hits = sum(1 for w in meaningful_words if w in DOMAIN_TERMS)
    specificity = domain_hits / max(len(meaningful_words), 1)

    # Code quality: look for runnable patterns inside fenced code blocks
    code_blocks = re.findall(r'```[\w]*\n(.*?)```', text, re.DOTALL)
    code_quality = 0
    if code_blocks:
        for block in code_blocks:
            # Imports, definitions, declarations, or function calls suggest real code
            if re.search(r'(import |from |def |class |function |const |let |var |\w+\()', block):
                code_quality = 1
                break

    # Instruction density: imperative verbs per 100 words
    imperative_patterns = r'\b(create|implement|add|remove|update|configure|set|run|execute|validate|check|ensure|verify|test|deploy|build|install|use|read|write|fix|patch)\b'
    imperative_count = len(re.findall(imperative_patterns, text.lower()))
    instruction_density = (imperative_count / word_count) * 100 if word_count > 0 else 0

    return {
        "specificity": round(specificity, 3),
        "code_quality": code_quality,
        "instruction_density": round(instruction_density, 2),
    }

def check_links(body, base_dir):
    """Validate internal file references exist on disk.

    Args:
        body: Markdown body text
        base_dir: Base directory for resolving relative paths

    Returns:
        dict: {cleaned_link: exists_bool}
    """
    # Markdown links whose target does not start with "http"
    links = re.findall(r'\[.*?\]\(((?!http)[^)]+)\)', body)
    # Backtick-quoted paths with at least one directory component
    file_refs = re.findall(r'`((?:[\w-]+/)+[\w.-]+)`', body)

    results = {}
    for link in links + file_refs:
        # Drop anchors (#...) and query strings (?...) before resolving
        clean = link.strip().split('#')[0].split('?')[0]
        if not clean:
            continue
        full_path = os.path.join(base_dir, clean)
        results[clean] = os.path.exists(full_path)

    return results

def check_staleness(fm, filepath, max_days=90):
    """Check if a component's frontmatter 'updated' date is stale.

    Args:
        fm: Frontmatter dict (e.g. from parse_frontmatter).
        filepath: Path to the file. NOTE(review): currently unused -- the
            original docstring mentioned an mtime comparison that is not
            implemented; parameter kept for interface compatibility.
        max_days: Age threshold in days.

    Returns:
        bool: True if stale (no 'updated' date, unparseable value, or the
        date is more than max_days ago).
    """
    from datetime import date  # local import: keeps this fix self-contained

    updated = fm.get('updated', '')
    if not updated:
        return True  # No update date = stale

    try:
        if isinstance(updated, str):
            update_date = datetime.strptime(updated, '%Y-%m-%d')
        elif isinstance(updated, datetime):
            update_date = updated
        elif isinstance(updated, date):
            # Bug fix: yaml.safe_load parses unquoted YYYY-MM-DD values as
            # datetime.date, which the original code always treated as stale.
            update_date = datetime(updated.year, updated.month, updated.day)
        else:
            return True

        age = datetime.now() - update_date
        return age.days > max_days
    except (ValueError, TypeError):
        return True

def grade_from_score(score):
    """Convert numeric score to letter grade.

    Args:
        score: 0-100 numeric score

    Returns:
        str: Letter grade (A, B, C, D, F)
    """
    # Standard 10-point bands; anything below 60 is F
    if score >= 90:
        return 'A'
    elif score >= 80:
        return 'B'
    elif score >= 70:
        return 'C'
    elif score >= 60:
        return 'D'
    else:
        return 'F'

def compute_weighted_score(scores, categories):
    """Compute weighted score from check results and category definitions.

    Args:
        scores: dict of {check_name: 0_or_1}
        categories: list of (category_name, weight, [check_names])

    Returns:
        tuple: (total_score, category_scores_dict)
    """
    category_scores = {}
    total = 0.0

    for cat_name, weight, checks in categories:
        # A category with no checks contributes nothing
        if not checks:
            category_scores[cat_name] = 0.0
            continue
        passed = sum(scores.get(c, 0) for c in checks)
        # Category score = pass fraction scaled by the category weight
        cat_score = (passed / len(checks)) * weight
        category_scores[cat_name] = round(cat_score, 1)
        total += cat_score

    return round(total, 1), category_scores

def aggregate_results(results, component_type):
    """Aggregate a list of graded component results into a summary.

    Args:
        results: List of dicts from individual grading; each must have
            'grade', 'total_base', and 'scores' keys.
        component_type: str like 'agents', 'skills', etc.

    Returns:
        dict: Summary with grade distribution, attribute pass rates, etc.
    """
    total = len(results)
    if total == 0:
        # Empty-input shape mirrors the populated one so consumers need no special-casing
        return {
            "component_type": component_type,
            "summary": {"total_components": 0, "average_score": 0, "grade_distribution": {}, "errors": 0},
            "attribute_pass_rates": {},
            "components": [],
            "errors": [],
        }

    grades = defaultdict(int)
    for r in results:
        grades[r['grade']] += 1

    avg_score = sum(r['total_base'] for r in results) / total

    # Union of all attribute names seen across components
    all_attrs = set()
    for r in results:
        all_attrs.update(r['scores'].keys())

    attr_pass_rates = {}
    for attr in sorted(all_attrs):
        passed = sum(1 for r in results if r['scores'].get(attr, 0) == 1)
        attr_pass_rates[attr] = {
            'passed': passed,
            'failed': total - passed,
            'rate': round(passed / total * 100, 1)
        }

    return {
        "component_type": component_type,
        "summary": {
            "total_components": total,
            "average_score": round(avg_score, 1),
            "grade_distribution": dict(grades),
            "errors": 0,
        },
        "attribute_pass_rates": attr_pass_rates,
        "components": results,
        "errors": [],
    }

def output_results(data, output_path=None, format='json'):
    """Write results to file or stdout.

    Args:
        data: Dict of results to output
        output_path: File path (None for stdout)
        format: 'json' or 'summary' (summary always prints to stdout)
    """
    if format == 'json':
        content = json.dumps(data, indent=2)
        if output_path:
            with open(output_path, 'w') as f:
                f.write(content)
        else:
            print(content)
    elif format == 'summary':
        summary = data.get('summary', {})
        print(f"\n{'='*60}")
        print(f"CODITECT QA REPORT - {summary.get('total_components', 0)} {data.get('component_type', 'Components')} Graded")
        print(f"{'='*60}")
        print(f"\nAverage Score: {summary.get('average_score', 0):.1f}%")
        print(f"\nGrade Distribution:")
        for g in ['A', 'B', 'C', 'D', 'F']:
            count = summary.get('grade_distribution', {}).get(g, 0)
            total = summary.get('total_components', 1)
            pct = count / total * 100 if total else 0
            # ASCII bar: one '#' per 2 percent
            bar = '#' * int(pct / 2)
            print(f"  {g}: {count:4d} ({pct:5.1f}%) {bar}")

def parse_tools_field(tools_raw):
    """Parse tools field from frontmatter (string or list).

    Args:
        tools_raw: Frontmatter 'tools' value -- either a YAML list or a
            comma-separated string. Any other type yields [].

    Returns:
        list: List of tool name strings, whitespace-trimmed.
    """
    if isinstance(tools_raw, list):
        return [str(t).strip() for t in tools_raw]
    elif isinstance(tools_raw, str):
        return [t.strip() for t in tools_raw.split(',')]
    return []

def validate_tools(tools_list):
    """Check if all tools in list are valid Anthropic tools.

    Args:
        tools_list: List of tool name strings (e.g. from parse_tools_field).

    Returns:
        bool: True if the list contains at least one non-empty tool name and
        every non-empty entry is in VALID_TOOLS.
    """
    # Bug fix: the original counted empty entries toward len() but skipped
    # them in all(), so a list like [""] (from splitting an empty tools
    # string) validated as True. Filter empties before both checks.
    tools = [t for t in tools_list if t]
    return bool(tools) and all(t in VALID_TOOLS for t in tools)