# scripts-metadata

""" Metadata Analyst Agent

Analyzes document classification based on:
- YAML frontmatter fields
- Explicit type declarations
- Tags and keywords
- Status and audience fields
"""
import sys
import time
from pathlib import Path
from typing import Dict, List

# Put the parent of this file's directory at the front of sys.path before the
# imports below run (presumably so the project-local `core` and `analysts`
# packages resolve when this file is executed directly - confirm layout).
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.models import Document, AnalystVote, DocumentType
from analysts.base import BaseAnalyst


class MetadataAnalyst(BaseAnalyst):
    """Analyst that classifies based on frontmatter metadata.

    Scores candidate document types from the frontmatter mapping (explicit
    ``type``, ``tags``, ``keywords``, ``title``, ``adr_number``, ``status``
    and ``audience``) and votes for the highest-scoring type.
    """

    name = "metadata"

    # Tag patterns that indicate document types. Matching is by substring:
    # a tag/keyword counts when it *contains* any pattern listed for a type.
    TAG_PATTERNS: Dict[str, List[str]] = {
        'agent': ['agent', 'ai-agent', 'specialist', 'autonomous', 'subagent'],
        'command': ['command', 'cli', 'slash-command', 'invocation'],
        'skill': ['skill', 'capability', 'pattern', 'reusable'],
        'script': ['script', 'automation', 'python', 'bash', 'shell'],
        'hook': ['hook', 'trigger', 'event', 'pre-commit', 'post-commit'],
        'guide': ['guide', 'tutorial', 'how-to', 'walkthrough', 'getting-started'],
        'workflow': ['workflow', 'pipeline', 'process', 'automation'],
        'adr': ['adr', 'decision', 'architecture-decision', 'architectural'],
        'reference': ['reference', 'api', 'specification', 'schema', 'documentation'],
        'config': ['config', 'configuration', 'settings', 'options'],
    }

    # Keywords in title that indicate types (compared case-insensitively).
    TITLE_KEYWORDS: Dict[str, List[str]] = {
        'adr': ['ADR-', 'Architecture Decision', 'Decision Record'],
        'guide': ['Guide', 'Tutorial', 'How to', 'Getting Started', 'Quick Start'],
        'workflow': ['Workflow', 'Pipeline', 'Process'],
        'agent': ['Agent', 'Specialist'],
        'command': ['Command'],
        'reference': ['Reference', 'Specification', 'API', 'Schema'],
    }

    @staticmethod
    def _as_list(value: object) -> list:
        """Coerce a frontmatter value into a list of items.

        Falsy values become ``[]``, a single string becomes a one-element
        list, other iterables are materialised with ``list()``, and any
        remaining scalar (e.g. a number) is wrapped instead of raising.
        """
        if not value:
            return []
        if isinstance(value, (str, bytes)):
            return [value]
        try:
            return list(value)
        except TypeError:
            return [value]

    def analyze(self, document: Document) -> AnalystVote:
        """Analyze document based on frontmatter metadata.

        Args:
            document: Document whose ``frontmatter`` mapping is inspected.
                A missing (``None``) or empty frontmatter is treated as
                having no fields.

        Returns:
            The vote built by ``self._create_vote`` (not defined in this
            class; presumably inherited from ``BaseAnalyst``) carrying the
            winning type, its confidence (capped at 0.99), up to three
            reasons and diagnostic metadata. When no indicator is found the
            vote is ``'reference'`` with confidence 0.40.
        """
        # perf_counter is monotonic, so the elapsed time can never be negative.
        start = time.perf_counter()

        fm = document.frontmatter or {}
        scores: Dict[str, float] = {}
        reasons: List[str] = []

        # Check 1: Explicit type field (highest confidence)
        if 'type' in fm:
            explicit_type = str(fm['type']).strip().lower()
            if any(explicit_type == t.value for t in DocumentType):
                scores[explicit_type] = 0.99
                reasons.append(f"Explicit type='{explicit_type}' in frontmatter")

        # Check 2: Tags analysis
        tags = self._as_list(fm.get('tags'))
        tags_lower = [str(t).lower() for t in tags]

        for doc_type, type_tags in self.TAG_PATTERNS.items():
            matches = sum(1 for t in tags_lower if any(pt in t for pt in type_tags))
            if matches > 0:
                # 0.45 for one matching tag, +0.15 per extra tag, capped at 0.90.
                tag_score = min(0.90, 0.30 + (matches * 0.15))
                if doc_type not in scores or scores[doc_type] < tag_score:
                    scores[doc_type] = tag_score
                    reasons.append(f"Tags match {doc_type} pattern ({matches} matches)")

        # Check 3: Keywords analysis (any hit lifts the type to at least 0.70)
        for kw in self._as_list(fm.get('keywords')):
            kw_lower = str(kw).lower()
            for doc_type, type_tags in self.TAG_PATTERNS.items():
                if any(pt in kw_lower for pt in type_tags):
                    scores[doc_type] = max(scores.get(doc_type, 0), 0.70)

        # Check 4: Title analysis
        # Coerce to str so a missing or non-string title cannot raise.
        title_lower = str(fm.get('title') or '').lower()
        for doc_type, title_keywords in self.TITLE_KEYWORDS.items():
            for keyword in title_keywords:
                if keyword.lower() in title_lower:
                    title_score = 0.75
                    if doc_type not in scores or scores[doc_type] < title_score:
                        scores[doc_type] = title_score
                        reasons.append(f"Title contains '{keyword}'")

        # Check 5: ADR number field
        if 'adr_number' in fm:
            scores['adr'] = max(scores.get('adr', 0), 0.95)
            reasons.append("Has adr_number field")

        # Check 6: Status field (weak indicator)
        # Normalised like the other fields so e.g. 'Draft' is recognised.
        status = str(fm.get('status') or '').strip().lower()
        if status in ('draft', 'active', 'deprecated', 'archived'):
            # Having a valid status slightly boosts reference types.
            # NOTE(review): a type with no prior score starts from 0.5, so
            # this yields 0.55 and can outrank a single tag match (0.45) -
            # confirm that is intended.
            for boosted in ('reference', 'guide', 'adr'):
                scores[boosted] = scores.get(boosted, 0.5) + 0.05

        # Check 7: Audience field
        audience = str(fm.get('audience') or '').strip().lower()
        if audience == 'contributor':
            scores['adr'] = scores.get('adr', 0.5) + 0.10
            scores['reference'] = scores.get('reference', 0.5) + 0.05
        elif audience == 'customer':
            scores['guide'] = scores.get('guide', 0.5) + 0.10

        # Determine best classification
        if scores:
            # On ties max() keeps the type that was scored first.
            best_type = max(scores, key=scores.get)
            # Boosts above can push a score past 0.99; cap the confidence.
            confidence = min(0.99, scores[best_type])
            reasoning = "; ".join(reasons[:3]) if reasons else "Metadata analysis"
        else:
            # No frontmatter or no indicators
            best_type = 'reference'
            confidence = 0.40
            reasoning = "No frontmatter metadata found, low confidence default"

        duration_ms = int((time.perf_counter() - start) * 1000)

        return self._create_vote(
            classification=best_type,
            confidence=confidence,
            reasoning=reasoning,
            duration_ms=duration_ms,
            metadata={
                'has_frontmatter': bool(fm),
                'has_explicit_type': 'type' in fm,
                'tag_count': len(tags),
                'fields_present': list(fm),
                'all_scores': {k: round(v, 3) for k, v in scores.items()},
            },
        )