# scripts-reference-expert
"""Reference Type Expert.

Specializes in understanding what makes a document a "reference" document.
Reference docs provide technical specifications, API documentation, and
lookup information.

Key characteristics:
- API endpoints and methods
- Configuration options tables
- Schema definitions
- Technical specifications
- Lookup-oriented structure (not instructional)
"""
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

# Put the directory two levels above this file at the front of sys.path
# (presumably so the ``core`` package imported below resolves when this
# module is loaded outside an installed package -- confirm against callers).
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.models import Document, AnalystVote
from .base import TypeExpert, TypeAnalysis, ContentEnhancement
class ReferenceExpert(TypeExpert):
    """Expert in identifying and enhancing reference documents."""

    # Type label this expert reports in its analysis results.
    expert_type = "reference"

    # Strong semantic indicators of a reference document. Each entry is a
    # regular expression searched case-insensitively against the document text.
    strong_indicators = [
        r'api\s*reference',
        r'specification',
        r'schema',
        r'configuration\s*options?',
        r'parameters?\s*table',
        r'endpoint',
        r'method\s*signature',
    ]

    # What each analyst looks for in references, keyed by analyst name.
    # The values are human-readable descriptions, not patterns.
    analyst_expectations = {
        'metadata': [
            "type: reference in frontmatter",
            "component_type: reference",
        ],
        'content': [
            "## API Reference section",
            "## Schema section",
            "## Configuration section",
            "Tables with options/parameters",
        ],
        'structural': [
            "Path contains /reference/ or /docs/",
            "Table-heavy structure",
            "Organized by topic/component",
        ],
        'semantic': [
            "Technical specification language",
            "Declarative not instructional",
            "Lookup-oriented content",
        ],
        'pattern': [
            "Filename suggests reference (OVERVIEW, REFERENCE, SPEC)",
            "Title is descriptive not action-oriented",
        ],
    }
def analyze(
    self,
    document: Document,
    analyst_votes: List[AnalystVote]
) -> TypeAnalysis:
    """Analyze if document is truly a reference document.

    Collects evidence for and against the "reference" classification from
    the document text, its level-2 headings and its path, turns the evidence
    counts into a confidence score, and records which reference signals are
    absent so enhancements can be generated for them.

    Args:
        document: The document to inspect. ``body`` is preferred over
            ``content``; ``path`` is only used for a directory check.
        analyst_votes: Votes forwarded to ``identify_disagreeing_analysts``
            to find analysts that did not vote "reference".

    Returns:
        A ``TypeAnalysis`` carrying the verdict, a confidence clamped to be
        non-negative, both evidence lists, the missing signals and a short
        hint for each disagreeing analyst this expert knows how to address.
    """
    # Fall back to an empty string so the regex checks never receive None.
    content = document.body or document.content or ""
    headings = self.extract_headings(content)
    # Entries are indexed as [0] == level and [1] == text
    # (presumably (level, text) pairs -- confirm against extract_headings).
    h2_texts = [h[1].lower() for h in headings if h[0] == 2]

    evidence_against = []

    # Strong indicators: one piece of evidence per matching pattern.
    evidence_for = [
        f"Contains reference indicator: '{indicator}'"
        for indicator in self.strong_indicators
        if re.search(indicator, content, re.I)
    ]

    # Count tables, not table rows: each markdown table has exactly one
    # header separator row such as ``|---|---|``, so count those. Counting
    # every piped line would report a single three-line table as 3 tables.
    table_count = len(re.findall(
        r'^[ \t]*\|?[ \t]*:?-+:?[ \t]*(?:\|[ \t]*:?-+:?[ \t]*)+\|?[ \t]*$',
        content,
        re.M,
    ))
    if table_count >= 3:
        evidence_for.append(f"Has {table_count} tables (reference docs are table-heavy)")
    elif table_count >= 1:
        evidence_for.append(f"Has {table_count} table(s)")

    # Reference-style sections among the level-2 headings.
    ref_sections = ['api', 'schema', 'configuration', 'specification', 'overview', 'reference']
    for section in ref_sections:
        if any(section in h for h in h2_texts):
            evidence_for.append(f"Has {section} section")

    # Technical patterns in the text.
    if re.search(r'`[A-Z_]+`', content):  # Constants/enums
        evidence_for.append("Contains technical constants/enums")
    if re.search(r'type:\s*\w+', content):  # Type definitions
        evidence_for.append("Contains type definitions")
    if re.search(r'default:\s*', content, re.I):  # Default values
        evidence_for.append("Documents default values")

    # Path check; normalise separators so Windows-style paths match too.
    path_str = str(document.path).lower().replace('\\', '/')
    if '/reference/' in path_str:
        evidence_for.append("Located in /reference/ directory")

    # Evidence against being a reference.
    if any('step' in h for h in h2_texts):
        evidence_against.append("Has step sections - might be guide")
    if any('prerequisites' in h for h in h2_texts):
        evidence_against.append("Has prerequisites - might be guide")
    if re.search(r'how to\b', content, re.I) and not re.search(r'specification|api', content, re.I):
        evidence_against.append("Uses instructional language - might be guide")

    # Confidence: 0.12 per supporting item (capped at 0.98), minus 0.1 per
    # counter-indication. The verdict uses the unclamped value.
    confidence = min(0.98, len(evidence_for) * 0.12)
    if evidence_against:
        confidence -= len(evidence_against) * 0.1
    is_reference = len(evidence_for) >= 2 and confidence > 0.5

    # Hints for the disagreeing analysts this expert knows how to address;
    # analysts without an entry here are skipped.
    sway_hints = {
        'content': "Needs API Reference, Schema, or Specification sections",
        'semantic': "Needs more technical specification language, less instructional",
        'pattern': "Naming doesn't match reference conventions",
        'metadata': "Frontmatter needs type: reference",
    }
    disagreeing = self.identify_disagreeing_analysts(analyst_votes, 'reference')
    analysts_to_sway = {
        analyst_name: sway_hints[analyst_name]
        for analyst_name in disagreeing
        if analyst_name in sway_hints
    }

    # Signals whose absence generate_enhancements can compensate for.
    missing = []
    if not any('api' in h for h in h2_texts):
        missing.append('api_reference')
    if not any('schema' in h for h in h2_texts):
        missing.append('schema')
    if not any('specification' in h or 'config' in h for h in h2_texts):
        missing.append('specification')
    if table_count < 2:
        missing.append('tables')

    return TypeAnalysis(
        is_this_type=is_reference,
        confidence=max(0, confidence),
        evidence_for=evidence_for,
        evidence_against=evidence_against,
        semantic_purpose=self.analyze_semantic_purpose(document),
        missing_signals=missing,
        recommended_changes=[],
        analysts_to_sway=analysts_to_sway,
        expert_type=self.expert_type
    )
def generate_enhancements(
    self,
    document: Document,
    analysis: TypeAnalysis
) -> List[ContentEnhancement]:
    """Build one enhancement for every reference signal the analysis lacks.

    Args:
        document: Source of the text handed to the section generators and of
            the frontmatter ``title`` (``'System'`` when the key is absent).
        analysis: Its ``missing_signals`` selects which enhancements to build.

    Returns:
        ``ContentEnhancement`` objects in the fixed order API reference,
        schema, specification, tables; signals that are present are skipped.
    """
    text = document.body or document.content
    doc_title = document.frontmatter.get('title', 'System')

    # One row per supported signal:
    # (signal, section generator, insertion point, reason, analyst boost, priority)
    plans = (
        (
            'api_reference',
            self._generate_api_section,
            'after_overview',
            "Reference docs need API documentation",
            {'content': 0.15, 'semantic': 0.10},
            1,
        ),
        (
            'schema',
            self._generate_schema_section,
            'after_api',
            "Reference docs should include schema definitions",
            {'content': 0.15, 'structural': 0.10},
            1,
        ),
        (
            'specification',
            self._generate_specification_section,
            'after_overview',
            "Reference docs need configuration specifications",
            {'content': 0.15, 'pattern': 0.05},
            2,
        ),
        (
            'tables',
            self._generate_options_table,
            'in_specification',
            "Reference docs are table-heavy for quick lookup",
            {'content': 0.10, 'structural': 0.10},
            2,
        ),
    )

    # A generator only runs when its signal is actually missing.
    return [
        ContentEnhancement(
            signal_type=signal,
            content=build_section(text, doc_title),
            insertion_point=where,
            reason=why,
            expected_analyst_boost=boost,
            priority=rank
        )
        for signal, build_section, where, why, boost, rank in plans
        if signal in analysis.missing_signals
    ]
def _generate_api_section(self, content: str, title: str) -> str:
    """Generate a contextual "API Reference" markdown section.

    Row selection, in order of precedence:
    1. Function calls found in inline code (a name followed by ``(`` right
       after a backtick) -- up to five distinct names, listed as functions.
    2. A CRUD endpoint template when the text mentions ``/api/``,
       ``endpoint`` or ``route``.
    3. A minimal two-endpoint template otherwise.

    Args:
        content: Document text to scan for API hints.
        title: Not used by this generator; kept so all section generators
            share one signature.

    Returns:
        Markdown beginning and ending with a newline: an H2 heading (so a
        level-2 heading scan finds an "api" section), an endpoints table and
        a fenced JSON response example.
    """
    if re.search(r'/api/|endpoint|route', content, re.I):
        rows = [
            ('GET', '/api/v1/resource', 'List resources'),
            ('POST', '/api/v1/resource', 'Create resource'),
            ('GET', '/api/v1/resource/:id', 'Get resource by ID'),
            ('PUT', '/api/v1/resource/:id', 'Update resource'),
            ('DELETE', '/api/v1/resource/:id', 'Delete resource'),
        ]
    else:
        rows = [
            ('GET', '/api/resource', 'Retrieve resources'),
            ('POST', '/api/resource', 'Create new resource'),
        ]

    # Match the name of any call written in inline code, e.g. `load(path)`.
    # dict.fromkeys drops repeats while keeping first-seen order.
    functions = list(dict.fromkeys(re.findall(r'`(\w+)\(', content)))
    if functions:
        # The name goes in the code-formatted middle column; the first
        # column carries the kind of entry.
        rows = [('Function', name, 'Description') for name in functions[:5]]

    table_rows = "\n".join(
        f"| {kind} | `{target}` | {summary} |" for kind, target, summary in rows
    )

    return (
        "\n"
        "## API Reference\n"
        "\n"
        "### Endpoints\n"
        "\n"
        "| Method | Endpoint | Description |\n"
        "|---|---|---|\n"
        f"{table_rows}\n"
        "\n"
        "### Response Format\n"
        "\n"
        "```json\n"
        "{\n"
        '  "status": "success",\n'
        '  "data": {}\n'
        "}\n"
        "```\n"
    )
def _generate_schema_section(self, content: str, title: str) -> str:
    """Generate a contextual "Schema Reference" markdown section.

    Fields are inferred from text of the form ``name``-in-backticks followed
    by ``: type`` (up to five distinct names, first occurrence wins). When
    none are found a default id / name / created_at set is used. The same
    field list drives both the YAML data model and the definitions table so
    the two cannot disagree.

    Args:
        content: Document text to scan for field declarations.
        title: Document title; lower-cased and reduced to word characters to
            form the top-level key of the data model.

    Returns:
        Markdown beginning and ending with a newline: an H2 heading, a fenced
        YAML data model and a field definitions table.
    """
    # Keep the first type seen for each field name; duplicate keys would
    # make the generated YAML invalid.
    inferred = {}
    for field_name, type_name in re.findall(r'`(\w+)`:\s*(\w+)', content):
        inferred.setdefault(field_name, type_name)

    fields = [
        (field_name, type_name, 'Field description')
        for field_name, type_name in list(inferred.items())[:5]
    ]
    if not fields:
        fields = [
            ('id', 'string', 'Unique identifier'),
            ('name', 'string', 'Resource name'),
            ('created_at', 'datetime', 'Creation timestamp'),
        ]

    # Collapse anything that is not a word character so titles containing
    # punctuation (e.g. a colon) still yield a usable YAML key.
    model_name = re.sub(r'\W+', '_', title.lower()).strip('_') or 'resource'

    property_lines = []
    for field_name, type_name, description in fields:
        property_lines.extend([
            f"    {field_name}:",
            f"      type: {type_name}",
            f"      description: {description}",
        ])

    table_rows = [
        f"| {field_name} | {type_name} | {description} |"
        for field_name, type_name, description in fields
    ]

    lines = [
        "",
        "## Schema Reference",
        "",
        "### Data Model",
        "",
        "```yaml",
        f"{model_name}:",
        "  type: object",
        "  properties:",
        *property_lines,
        "```",
        "",
        "### Field Definitions",
        "",
        "| Field | Type | Description |",
        "|---|---|---|",
        *table_rows,
        "",  # trailing newline
    ]
    return "\n".join(lines)
def _generate_specification_section(self, content: str, title: str) -> str:
"""Generate contextual Specification section."""
return f"""
Configuration Specification
Options
| Option | Type | Default | Description |
|---|---|---|---|
enabled | boolean | true | Enable/disable feature |
timeout | integer | 30 | Timeout in seconds |
max_retries | integer | 3 | Maximum retry attempts |
log_level | string | "info" | Logging verbosity |
Environment Variables
| Variable | Required | Description |
|---|---|---|
API_KEY | Yes | API authentication key |
BASE_URL | No | Base URL override |
| """ |
def _generate_options_table(self, content: str, title: str) -> str:
    """Generate an options/parameters table.

    The table is a fixed template with name / type / options rows. It is
    emitted under an H3 heading -- NOTE(review): the heading level is
    reconstructed on the assumption that the table is placed inside an
    existing specification section; confirm against the inserter.

    Args:
        content: Not used; kept so all section generators share one signature.
        title: Not used; kept for the same reason.

    Returns:
        Markdown beginning and ending with a newline.
    """
    return (
        "\n"
        "### Parameters\n"
        "\n"
        "| Parameter | Type | Required | Description |\n"
        "|---|---|---|---|\n"
        "| name | string | Yes | Resource name |\n"
        "| type | string | No | Resource type |\n"
        "| options | object | No | Additional options |\n"
    )