# scripts-metadata
""" Metadata Analyst Agent
Analyzes document classification based on:
- YAML frontmatter fields
- Explicit type declarations
- Tags and keywords
- Status and audience fields """
import sys
import time
from pathlib import Path
from typing import Dict, List

# Make the parent package importable when this file is run from its own directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.models import Document, AnalystVote, DocumentType
from analysts.base import BaseAnalyst
class MetadataAnalyst(BaseAnalyst):
    """Analyst that classifies based on frontmatter metadata.

    Each frontmatter signal (explicit type, tags, keywords, title,
    ``adr_number``, status, audience) contributes a per-type score.  A
    recognized explicit ``type`` always wins; otherwise the highest-scoring
    type is chosen.
    """

    name = "metadata"

    # Tag patterns that indicate document types.  Matching is by substring:
    # a tag or keyword matches when it *contains* one of these patterns.
    TAG_PATTERNS: Dict[str, List[str]] = {
        'agent': ['agent', 'ai-agent', 'specialist', 'autonomous', 'subagent'],
        'command': ['command', 'cli', 'slash-command', 'invocation'],
        'skill': ['skill', 'capability', 'pattern', 'reusable'],
        'script': ['script', 'automation', 'python', 'bash', 'shell'],
        'hook': ['hook', 'trigger', 'event', 'pre-commit', 'post-commit'],
        'guide': ['guide', 'tutorial', 'how-to', 'walkthrough', 'getting-started'],
        'workflow': ['workflow', 'pipeline', 'process', 'automation'],
        'adr': ['adr', 'decision', 'architecture-decision', 'architectural'],
        'reference': ['reference', 'api', 'specification', 'schema', 'documentation'],
        'config': ['config', 'configuration', 'settings', 'options'],
    }

    # Keywords in title that indicate types (compared case-insensitively).
    TITLE_KEYWORDS: Dict[str, List[str]] = {
        'adr': ['ADR-', 'Architecture Decision', 'Decision Record'],
        'guide': ['Guide', 'Tutorial', 'How to', 'Getting Started', 'Quick Start'],
        'workflow': ['Workflow', 'Pipeline', 'Process'],
        'agent': ['Agent', 'Specialist'],
        'command': ['Command'],
        'reference': ['Reference', 'Specification', 'API', 'Schema'],
    }

    @staticmethod
    def _as_list(value) -> list:
        """Normalize a frontmatter value (missing, scalar or iterable) to a list."""
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        try:
            return list(value)
        except TypeError:
            # Non-iterable scalar such as ``tags: 42``.
            return [value]

    @staticmethod
    def _as_text(value) -> str:
        """Normalize a frontmatter value to a stripped, lower-cased string."""
        if value is None:
            return ''
        return str(value).strip().lower()

    def analyze(self, document: Document) -> AnalystVote:
        """Analyze document based on frontmatter metadata.

        Args:
            document: Document whose ``frontmatter`` mapping is inspected.
                A missing (``None``) frontmatter is treated as empty.

        Returns:
            The vote built by ``self._create_vote`` carrying the chosen
            classification, a confidence capped at 0.99, up to three
            reasons, the elapsed time in milliseconds and diagnostic
            metadata (including every per-type score).
        """
        start = time.perf_counter()
        # Treat a missing frontmatter as empty so the membership tests and
        # .get() calls below cannot fail on None.
        fm = document.frontmatter or {}
        scores: Dict[str, float] = {}
        reasons: List[str] = []

        # Check 1: Explicit type field (highest confidence)
        explicit_type = None
        if 'type' in fm:
            candidate = self._as_text(fm['type'])
            if candidate in {t.value for t in DocumentType}:
                explicit_type = candidate
                scores[candidate] = 0.99
                reasons.append(f"Explicit type='{candidate}' in frontmatter")

        # Check 2: Tags analysis
        tags = self._as_list(fm.get('tags'))
        tags_lower = [str(t).lower() for t in tags]
        for doc_type, patterns in self.TAG_PATTERNS.items():
            matches = sum(
                1 for tag in tags_lower if any(p in tag for p in patterns)
            )
            if matches > 0:
                # More matching tags raise the score, capped at 0.90.
                tag_score = min(0.90, 0.30 + (matches * 0.15))
                if scores.get(doc_type, 0.0) < tag_score:
                    scores[doc_type] = tag_score
                    reasons.append(
                        f"Tags match {doc_type} pattern ({matches} matches)"
                    )

        # Check 3: Keywords analysis
        for kw in self._as_list(fm.get('keywords')):
            kw_lower = str(kw).lower()
            for doc_type, patterns in self.TAG_PATTERNS.items():
                if any(p in kw_lower for p in patterns):
                    scores[doc_type] = max(scores.get(doc_type, 0), 0.70)

        # Check 4: Title analysis
        # Title may be absent, None or a non-string scalar; normalize once.
        title_lower = self._as_text(fm.get('title'))
        for doc_type, title_keywords in self.TITLE_KEYWORDS.items():
            for keyword in title_keywords:
                if keyword.lower() in title_lower:
                    title_score = 0.75
                    if scores.get(doc_type, 0.0) < title_score:
                        scores[doc_type] = title_score
                        reasons.append(f"Title contains '{keyword}'")

        # Check 5: ADR number field
        if 'adr_number' in fm:
            scores['adr'] = max(scores.get('adr', 0), 0.95)
            reasons.append("Has adr_number field")

        # Check 6: Status field (weak indicator)
        status = self._as_text(fm.get('status'))
        if status in ('draft', 'active', 'deprecated', 'archived'):
            # Having a valid status slightly boosts reference types
            for t in ('reference', 'guide', 'adr'):
                scores[t] = scores.get(t, 0.5) + 0.05

        # Check 7: Audience field
        audience = self._as_text(fm.get('audience'))
        if audience == 'contributor':
            scores['adr'] = scores.get('adr', 0.5) + 0.10
            scores['reference'] = scores.get('reference', 0.5) + 0.05
        elif audience == 'customer':
            scores['guide'] = scores.get('guide', 0.5) + 0.10

        # Determine best classification
        if scores:
            if explicit_type is not None:
                # The status/audience boosts are unclamped and can lift
                # another type above 0.99; a recognized explicit type must
                # still take precedence over those inferred signals.
                best_type = explicit_type
            else:
                best_type = max(scores, key=scores.get)
            confidence = min(0.99, scores[best_type])
            reasoning = "; ".join(reasons[:3]) if reasons else "Metadata analysis"
        else:
            # No frontmatter or no indicators
            best_type = 'reference'
            confidence = 0.40
            reasoning = "No frontmatter metadata found, low confidence default"

        duration_ms = int((time.perf_counter() - start) * 1000)
        return self._create_vote(
            classification=best_type,
            confidence=confidence,
            reasoning=reasoning,
            duration_ms=duration_ms,
            metadata={
                'has_frontmatter': bool(fm),
                'has_explicit_type': 'type' in fm,
                'tag_count': len(tags),
                'fields_present': list(fm.keys()) if fm else [],
                'all_scores': {k: round(v, 3) for k, v in scores.items()},
            },
        )