# scripts-reference-expert

""" Reference Type Expert

Specializes in understanding what makes a document a "reference" document. Reference docs provide technical specifications, API documentation, and lookup information.

Key characteristics:

- API endpoints and methods
- Configuration options tables
- Schema definitions
- Technical specifications
- Lookup-oriented structure (not instructional)
"""

import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

# Put the directory two levels above this file at the front of sys.path;
# presumably this is what makes the `core.models` import below resolvable
# when the module is run outside an installed package (confirm).
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.models import Document, AnalystVote
from .base import TypeExpert, TypeAnalysis, ContentEnhancement

# NOTE(review): the attributes and functions that follow this class in the file
# sit at module level (column 0) rather than inside the class body, even though
# they take `self` -- it looks like one level of indentation was lost. Confirm
# and re-indent them under this class.
class ReferenceExpert(TypeExpert):
    """Expert in identifying and enhancing reference documents."""

# Document type this expert is responsible for.
expert_type = "reference"

# Regex fragments whose presence strongly suggests a reference document.
# They are plain pattern strings (not compiled) because the analyzer echoes
# the matched pattern text back in its evidence messages.
strong_indicators = [
    r"api\s*reference",
    r"specification",
    r"schema",
    r"configuration\s*options?",
    r"parameters?\s*table",
    r"endpoint",
    r"method\s*signature",
]

# Human-readable description, per analyst, of the signals that analyst looks
# for when deciding a document is a reference.
analyst_expectations = dict(
    metadata=[
        "type: reference in frontmatter",
        "component_type: reference",
    ],
    content=[
        "## API Reference section",
        "## Schema section",
        "## Configuration section",
        "Tables with options/parameters",
    ],
    structural=[
        "Path contains /reference/ or /docs/",
        "Table-heavy structure",
        "Organized by topic/component",
    ],
    semantic=[
        "Technical specification language",
        "Declarative not instructional",
        "Lookup-oriented content",
    ],
    pattern=[
        "Filename suggests reference (OVERVIEW, REFERENCE, SPEC)",
        "Title is descriptive not action-oriented",
    ],
)

def analyze(
    self,
    document: Document,
    analyst_votes: List[AnalystVote]
) -> TypeAnalysis:
    """Assess whether ``document`` is a reference document.

    Evidence for and against is collected from the document text, its
    level-2 headings and its path. Each supporting item is worth 0.12
    confidence (capped at 0.98) and each contrary item subtracts 0.1. The
    document is classified as a reference when at least two supporting items
    exist and the resulting confidence exceeds 0.5.

    Args:
        document: Document to classify. ``body`` (falling back to
            ``content``) and ``path`` are read.
        analyst_votes: Votes forwarded to ``identify_disagreeing_analysts``
            to find analysts that did not vote ``'reference'``.

    Returns:
        A ``TypeAnalysis`` carrying the verdict, the confidence clamped to be
        non-negative, both evidence lists, the missing signals and a message
        for each disagreeing analyst that has one.
    """
    # Fall back to an empty string so the regex checks never receive None.
    content = document.body or document.content or ""
    headings = self.extract_headings(content)
    # NOTE(review): assumes each heading is a (level, text) pair -- confirm
    # against extract_headings.
    h2_texts = [h[1].lower() for h in headings if h[0] == 2]

    # Gather evidence
    evidence_for = []
    evidence_against = []

    # Check strong indicators
    for indicator in self.strong_indicators:
        if re.search(indicator, content, re.I):
            evidence_for.append(f"Contains reference indicator: '{indicator}'")

    # Count tables (references are table-heavy). A table is a run of
    # consecutive pipe-delimited lines; counting the lines themselves would
    # report a single three-line table as "3 tables".
    row_pattern = re.compile(r'\|.*\|.*\|')
    table_count = 0
    inside_table = False
    for line in content.splitlines():
        is_row = row_pattern.search(line) is not None
        if is_row and not inside_table:
            table_count += 1
        inside_table = is_row

    if table_count >= 3:
        evidence_for.append(f"Has {table_count} tables (reference docs are table-heavy)")
    elif table_count >= 1:
        evidence_for.append(f"Has {table_count} table(s)")

    # Check for reference-style sections
    ref_sections = ['api', 'schema', 'configuration', 'specification', 'overview', 'reference']
    for section in ref_sections:
        if any(section in h for h in h2_texts):
            evidence_for.append(f"Has {section} section")

    # Check for technical patterns
    if re.search(r'`[A-Z_]+`', content):  # Constants/enums
        evidence_for.append("Contains technical constants/enums")
    if re.search(r'type:\s*\w+', content):  # Type definitions
        evidence_for.append("Contains type definitions")
    if re.search(r'default:\s*', content, re.I):  # Default values
        evidence_for.append("Documents default values")

    # Check path; normalise Windows separators so the directory test also
    # works for backslash-delimited paths.
    path_str = str(document.path).lower().replace('\\', '/')
    if '/reference/' in path_str:
        evidence_for.append("Located in /reference/ directory")

    # Evidence against being a reference
    if any('step' in h for h in h2_texts):
        evidence_against.append("Has step sections - might be guide")
    if any('prerequisites' in h for h in h2_texts):
        evidence_against.append("Has prerequisites - might be guide")
    if re.search(r'how to\b', content, re.I) and not re.search(r'specification|api', content, re.I):
        evidence_against.append("Uses instructional language - might be guide")

    # Calculate confidence
    confidence = min(0.98, len(evidence_for) * 0.12)
    if evidence_against:
        confidence -= len(evidence_against) * 0.1

    is_reference = len(evidence_for) >= 2 and confidence > 0.5

    # Map each disagreeing analyst to the change that would sway it.
    # Analysts without an entry here (e.g. 'structural') get no message.
    sway_messages = {
        'content': "Needs API Reference, Schema, or Specification sections",
        'semantic': "Needs more technical specification language, less instructional",
        'pattern': "Naming doesn't match reference conventions",
        'metadata': "Frontmatter needs type: reference",
    }
    disagreeing = self.identify_disagreeing_analysts(analyst_votes, 'reference')
    analysts_to_sway = {
        analyst_name: sway_messages[analyst_name]
        for analyst_name, _vote in disagreeing.items()
        if analyst_name in sway_messages
    }

    # Identify missing signals
    missing = []
    if not any('api' in h for h in h2_texts):
        missing.append('api_reference')
    if not any('schema' in h for h in h2_texts):
        missing.append('schema')
    if not any('specification' in h or 'config' in h for h in h2_texts):
        missing.append('specification')
    if table_count < 2:
        missing.append('tables')

    return TypeAnalysis(
        is_this_type=is_reference,
        confidence=max(0, confidence),
        evidence_for=evidence_for,
        evidence_against=evidence_against,
        semantic_purpose=self.analyze_semantic_purpose(document),
        missing_signals=missing,
        recommended_changes=[],
        analysts_to_sway=analysts_to_sway,
        expert_type=self.expert_type
    )

def generate_enhancements(
    self,
    document: Document,
    analysis: TypeAnalysis
) -> List[ContentEnhancement]:
    """Build one enhancement per reference signal the analysis found missing.

    Args:
        document: Source document; its body (or content) and the ``title``
            frontmatter entry (default ``'System'``) feed the generators.
        analysis: Result whose ``missing_signals`` selects which sections
            are generated.

    Returns:
        Enhancements in fixed order: API reference, schema, specification,
        tables -- each included only when its signal is missing.
    """
    body_text = document.body or document.content
    doc_title = document.frontmatter.get('title', 'System')

    # (signal, generator method, insertion point, reason, analyst boost, priority)
    plan = (
        ('api_reference', '_generate_api_section', 'after_overview',
         "Reference docs need API documentation",
         {'content': 0.15, 'semantic': 0.10}, 1),
        ('schema', '_generate_schema_section', 'after_api',
         "Reference docs should include schema definitions",
         {'content': 0.15, 'structural': 0.10}, 1),
        ('specification', '_generate_specification_section', 'after_overview',
         "Reference docs need configuration specifications",
         {'content': 0.15, 'pattern': 0.05}, 2),
        ('tables', '_generate_options_table', 'in_specification',
         "Reference docs are table-heavy for quick lookup",
         {'content': 0.10, 'structural': 0.10}, 2),
    )

    enhancements = []
    for signal, builder_name, anchor, why, boost, rank in plan:
        if signal not in analysis.missing_signals:
            continue
        # Resolve the generator lazily so only needed generators are touched.
        generated = getattr(self, builder_name)(body_text, doc_title)
        enhancements.append(ContentEnhancement(
            signal_type=signal,
            content=generated,
            insertion_point=anchor,
            reason=why,
            expected_analyst_boost=boost,
            priority=rank
        ))

    return enhancements

def _generate_api_section(self, content: str, title: str) -> str:
    """Build a Markdown ``## API Reference`` section.

    Table rows are chosen in this order of precedence:

    1. Function names found in inline code (a backtick followed by
       ``name(``): up to five, de-duplicated, in order of appearance.
    2. Five CRUD placeholder endpoints when the content mentions
       ``/api/``, ``endpoint`` or ``route`` (case-insensitive).
    3. Two generic placeholder endpoints otherwise.

    Args:
        content: Document text scanned for API hints.
        title: Unused; kept so all section generators share one signature.

    Returns:
        Markdown containing an endpoints table and a JSON response example.
    """
    endpoints = []

    # Look for existing endpoint patterns
    if re.search(r'/api/|endpoint|route', content, re.I):
        endpoints = [
            ('GET', '/api/v1/resource', 'List resources'),
            ('POST', '/api/v1/resource', 'Create resource'),
            ('GET', '/api/v1/resource/:id', 'Get resource by ID'),
            ('PUT', '/api/v1/resource/:id', 'Update resource'),
            ('DELETE', '/api/v1/resource/:id', 'Delete resource'),
        ]

    # Look for function patterns. Match on the opening "`name(" only: the
    # previous pattern also demanded a backtick immediately after "(", which
    # real inline code such as `name()` or `name(arg)` never satisfies.
    functions = list(dict.fromkeys(re.findall(r'`(\w+)\(', content)))
    if functions:
        endpoints = [(f, 'Function', 'Description') for f in functions[:5]]

    if not endpoints:
        endpoints = [
            ('GET', '/api/resource', 'Retrieve resources'),
            ('POST', '/api/resource', 'Create new resource'),
        ]

    table_rows = "\n".join(
        f"| {m} | `{e}` | {d} |" for m, e, d in endpoints
    )

    # The header and separator rows use the same pipe syntax as the data rows
    # so the whole block renders as one Markdown table.
    return f"""
## API Reference

### Endpoints

| Method | Endpoint | Description |
|--------|----------|-------------|
{table_rows}

### Response Format

```json
{{
  "status": "success",
  "data": {{}}
}}
```
"""

def _generate_schema_section(self, content: str, title: str) -> str:
    """Build a Markdown ``## Schema Reference`` section.

    Fields written in the content as `` `name`: type `` are used for both
    the YAML data model and the field table (first five distinct names, with
    a placeholder description). When none are found, a fixed placeholder
    model and table are emitted instead.

    Args:
        content: Document text scanned for field definitions.
        title: Document title; lower-cased with spaces replaced by
            underscores to form the top-level YAML key.

    Returns:
        Markdown containing a fenced YAML data model and a field table.
    """
    # Keep the first type seen per field name so the YAML has no duplicate keys.
    inferred = {}
    for field_name, field_kind in re.findall(r'`(\w+)`:\s*(\w+)', content):
        inferred.setdefault(field_name, field_kind)

    if inferred:
        model_fields = [
            (name, kind, 'Field description')
            for name, kind in list(inferred.items())[:5]
        ]
        table_fields = model_fields
    else:
        model_fields = [
            ('id', 'string', 'Unique identifier'),
            ('name', 'string', 'Display name'),
            ('metadata', 'object', 'Additional metadata'),
        ]
        table_fields = [
            ('id', 'string', 'Unique identifier'),
            ('name', 'string', 'Display name'),
            ('status', 'enum', 'Current status'),
        ]

    model_key = title.lower().replace(' ', '_')
    properties_yaml = "\n".join(
        f"    {name}:\n      type: {kind}\n      description: {desc}"
        for name, kind, desc in model_fields
    )
    field_rows = "\n".join(
        f"| `{name}` | {kind} | {desc} |" for name, kind, desc in table_fields
    )

    return f"""
## Schema Reference

### Data Model

```yaml
{model_key}:
  type: object
  properties:
{properties_yaml}
```

### Field Definitions

| Field | Type | Description |
|-------|------|-------------|
{field_rows}
"""

def _generate_specification_section(self, content: str, title: str) -> str:
    """Return a fixed Markdown ``## Configuration Specification`` section.

    The section holds a placeholder options table and a placeholder
    environment-variables table, both written as pipe-delimited Markdown
    tables.

    Args:
        content: Unused; kept so all section generators share one signature.
        title: Unused; kept for the same reason.

    Returns:
        The static Markdown text of the section.
    """
    # Plain string: the template has no placeholders, so no f-prefix is needed.
    return """
## Configuration Specification

### Options

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `enabled` | boolean | `true` | Enable/disable feature |
| `timeout` | integer | `30` | Timeout in seconds |
| `max_retries` | integer | `3` | Maximum retry attempts |
| `log_level` | string | `"info"` | Logging verbosity |

### Environment Variables

| Variable | Required | Description |
|----------|----------|-------------|
| `API_KEY` | Yes | API authentication key |
| `BASE_URL` | No | Base URL override |
"""

def _generate_options_table(self, content: str, title: str) -> str:
    """Return a fixed Markdown ``### Parameters`` table.

    The table is written with pipe-delimited rows and a separator row so it
    renders as a Markdown table.

    Args:
        content: Unused; kept so all section generators share one signature.
        title: Unused; kept for the same reason.

    Returns:
        The static Markdown text of the parameters table.
    """
    return """
### Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `name` | string | Yes | Resource name |
| `type` | string | No | Resource type |
| `options` | object | No | Additional options |
"""

# Section headings used in the generated reference content:
#   API Reference
#     Endpoints
#     Response Format
#   Schema Reference
#     Data Model
#     Field Definitions
#   Configuration Specification
#     Options
#     Environment Variables
#   Parameters