# scripts-index-expert
""" Index Expert - Type Expert for Index/Catalog Documents
Identifies index files - documents that catalog, list, or provide navigation to other documents or components in a collection.
Key signals:
- INDEX in filename
- Lists of links/references
- Table of contents structure
- Inventory/catalog patterns
- Navigation structure """
import re
import sys
from pathlib import Path
from typing import Dict, List

# Put the directory two levels above this file at the front of sys.path
# before the project imports below run (presumably so that the absolute
# `core` package resolves when this module is loaded directly - confirm).
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.models import Document, AnalystVote  # noqa: E402
from .base import TypeExpert, TypeAnalysis, ContentEnhancement  # noqa: E402
class IndexExpert(TypeExpert):
    """Expert for identifying index/catalog documents.

    ``analyze`` scores a document from its filename, its markdown headings,
    the number and kind of markdown links, link-bearing list items and table
    rows, then subtracts for tutorial headings, large fenced code blocks and
    a very low link count.  ``generate_enhancements`` turns the missing
    signals reported by ``analyze`` into template sections.
    """

    EXPERT_TYPE = 'index'

    # Heading keywords (compared against lower-cased content) that count as
    # index-style sections.
    INDEX_SECTIONS = [
        'table of contents',
        'contents',
        'index',
        'catalog',
        'inventory',
        'directory',
        'listing',
        'navigation',
        'components',
        'available',
        'list of',
    ]

    # Index patterns.  Not referenced by the methods of this class; kept
    # unchanged because code outside this class may read it.
    INDEX_PATTERNS = [
        r'\[.*?\]\(.*?\.md\)',  # Markdown links to .md files
        r'^\s*[-*]\s+\[',  # Bulleted lists with links
        r'^\s*\d+\.\s+\[',  # Numbered lists with links
        r'\|\s*\[.*?\]\(.*?\)\s*\|',  # Table with links
        r'see\s+also',
        r'related\s+(documents?|files?)',
    ]

    @staticmethod
    def _has_heading(text: str, title_pattern: str) -> bool:
        """Return True if a line of *text* is a ``#`` heading starting with *title_pattern*.

        The match is anchored to the start of a line so that URL fragments
        (``guide.md#tutorial``) and inline ``#`` characters are not mistaken
        for headings.  *title_pattern* is a regex fragment.
        """
        return re.search(
            rf'^[ \t]*#+[ \t]*(?:{title_pattern})', text, re.MULTILINE
        ) is not None

    @staticmethod
    def _is_markdown_target(target: str) -> bool:
        """Return True if a link target points at a ``.md`` file.

        A trailing ``#fragment``, ``?query`` or quoted link title is ignored,
        so ``guide.md#setup`` counts.  Anything that literally ends in
        ``.md`` is still accepted.
        """
        if target.endswith('.md'):
            return True
        tokens = target.split()
        if not tokens:
            return False
        path = tokens[0].split('#', 1)[0].split('?', 1)[0]
        return path.endswith('.md')

    @staticmethod
    def _has_substantial_code_block(content: str, min_chars: int = 200) -> bool:
        """Return True if any fenced code block body has at least *min_chars* characters.

        Fences are paired in document order (``finditer`` never overlaps
        matches), so prose lying between two separate code blocks is never
        measured as if it were code.
        """
        fenced_blocks = re.finditer(
            r'^[ \t]*```[^\n]*\n(.*?)^[ \t]*```',
            content,
            re.MULTILINE | re.DOTALL,
        )
        return any(
            len(block.group(1).rstrip('\n')) >= min_chars
            for block in fenced_blocks
        )

    def analyze(
        self,
        document: Document,
        analyst_votes: List[AnalystVote],
    ) -> TypeAnalysis:
        """Analyze if document is an index/catalog.

        Args:
            document: Object providing ``content`` (the text) and ``path``.
            analyst_votes: Votes providing ``classification`` and ``agent``;
                ``None`` is treated as no votes.

        Returns:
            A ``TypeAnalysis`` carrying the verdict, the confidence, the
            collected evidence, the missing signals and, when the document is
            judged an index, a message for every analyst that voted otherwise.
        """
        content = document.content
        content_lower = content.lower()

        evidence_for = []
        evidence_against = []
        missing_signals = []

        # Check filename
        doc_path = Path(document.path)
        filename = doc_path.stem.upper()
        is_index_file = any(
            keyword in filename for keyword in ('INDEX', 'CATALOG', 'INVENTORY')
        )
        if is_index_file:
            evidence_for.append(f"Filename indicates index: {doc_path.name}")

        # Check for index sections; every hit is counted but only the first
        # three are listed as evidence.
        section_count = 0
        for section in self.INDEX_SECTIONS:
            if self._has_heading(content_lower, re.escape(section)):
                section_count += 1
                if section_count <= 3:
                    evidence_for.append(f"Has index section: '{section}'")

        # Count markdown links (high density = likely index)
        md_links = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content)
        link_count = len(md_links)
        if link_count > 10:
            evidence_for.append(f"High link density: {link_count} links")
        elif link_count > 5:
            evidence_for.append(f"Moderate link density: {link_count} links")

        # Check for links to .md files specifically
        md_file_links = [
            link for link in md_links if self._is_markdown_target(link[1])
        ]
        if len(md_file_links) > 5:
            evidence_for.append(f"Links to {len(md_file_links)} markdown files")

        # Check for bulleted/numbered lists with links
        list_links = len(re.findall(r'^\s*[-*\d.]+\s+\[', content, re.MULTILINE))
        if list_links > 5:
            evidence_for.append(f"Has {list_links} list items with links")

        # Check for table of contents pattern (bullets linking to anchors)
        if re.search(r'^\s*[-*]\s+\[.*?\]\(#', content, re.MULTILINE):
            evidence_for.append("Has table of contents with anchor links")

        # Check for inventory-style tables
        table_rows = len(re.findall(r'^\|.*\|$', content, re.MULTILINE))
        if table_rows > 5 and link_count > 3:
            evidence_for.append(f"Has inventory table with {table_rows} rows")

        # Evidence against
        if self._has_heading(content_lower, r'step\s+\d|how\s+to|tutorial'):
            evidence_against.append("Has tutorial/guide sections")
        if self._has_substantial_code_block(content):
            evidence_against.append(
                "Has substantial code blocks - might be guide/reference"
            )
        if link_count < 3:
            evidence_against.append("Low link count for an index")

        # Missing signals
        if link_count < 5:
            missing_signals.append('navigation_links')
        if section_count < 1:
            missing_signals.append('index_sections')
        if not self._has_heading(content_lower, r'overview|about|description'):
            missing_signals.append('overview')

        # Calculate confidence
        confidence = self._calculate_confidence(
            is_index_file, evidence_for, evidence_against,
            link_count, section_count
        )
        # A matching filename lowers the bar from 0.6 to 0.4.
        is_index = confidence > 0.6 or (is_index_file and confidence > 0.4)

        # Determine which analysts to sway: only when the verdict is "index",
        # and only analysts that voted for something else.
        analysts_to_sway = {}
        if is_index:
            for vote in analyst_votes or ():
                if vote.classification != 'index':
                    analysts_to_sway[vote.agent] = (
                        f"Document is index, not {vote.classification}"
                    )

        return TypeAnalysis(
            is_this_type=is_index,
            confidence=confidence,
            evidence_for=evidence_for,
            evidence_against=evidence_against,
            semantic_purpose=(
                "Catalog and navigate to related documents" if is_index else "Unknown"
            ),
            missing_signals=missing_signals,
            recommended_changes=[],
            analysts_to_sway=analysts_to_sway,
            expert_type=self.EXPERT_TYPE
        )

    def _calculate_confidence(
        self,
        is_index_file: bool,
        evidence_for: List[str],
        evidence_against: List[str],
        link_count: int,
        section_count: int
    ) -> float:
        """Calculate confidence score.

        Starts at 0.6 when the filename matched (0.15 otherwise), adds capped
        bonuses for links (0.02 each, max 0.25), index sections (0.05 each,
        max 0.1) and supporting evidence items (0.02 each, max 0.1), subtracts
        0.12 per counter-evidence item, and clamps the result to [0.0, 0.98].
        """
        # Filename is strong signal
        base = 0.6 if is_index_file else 0.15
        # Link density is key
        base += min(0.25, link_count * 0.02)
        # Sections help
        base += min(0.1, section_count * 0.05)
        # Other evidence
        base += min(0.1, len(evidence_for) * 0.02)
        # Subtract for counter-evidence
        base -= len(evidence_against) * 0.12
        return max(0.0, min(0.98, base))

    def generate_enhancements(
        self,
        document: Document,
        analysis: TypeAnalysis
    ) -> List[ContentEnhancement]:
        """Generate enhancements for index documents.

        Produces one ``ContentEnhancement`` per recognised entry in
        ``analysis.missing_signals``, in the order the signals appear;
        unrecognised signals are skipped.  ``document`` is not read here.
        """
        # signal -> (content builder, insertion point, reason, boosts, priority)
        recipes = {
            'navigation_links': (
                self._generate_navigation_section,
                'after_overview',
                'Index documents need navigation links to other docs',
                {'pattern': 0.2, 'content': 0.15},
                1,
            ),
            'index_sections': (
                self._generate_index_structure,
                'after_title',
                'Index documents need catalog/directory sections',
                {'structural': 0.15, 'content': 0.1},
                1,
            ),
            'overview': (
                self._generate_overview,
                'after_title',
                'Index documents should have brief overview',
                {'content': 0.1, 'semantic': 0.05},
                2,
            ),
        }

        enhancements = []
        for signal in analysis.missing_signals:
            recipe = recipes.get(signal)
            if recipe is None:
                continue
            build_content, insertion_point, reason, boost, priority = recipe
            enhancements.append(ContentEnhancement(
                signal_type=signal,
                content=build_content(),
                insertion_point=insertion_point,
                reason=reason,
                # Copy so that two enhancements never share one dict.
                expected_analyst_boost=dict(boost),
                priority=priority
            ))
        return enhancements

    def _generate_navigation_section(self) -> str:
        """Generate navigation section."""
        # NOTE(review): the rows are plain text, not markdown links, so this
        # template does not raise the link count measured by `analyze` -
        # confirm whether the placeholders were meant to be links.
        return (
            '## Quick Navigation\n'
            '| Document | Description |\n'
            '|---|---|\n'
            '| Document 1 | Description of document 1 |\n'
            '| Document 2 | Description of document 2 |\n'
            '| Document 3 | Description of document 3 |\n'
        )

    def _generate_index_structure(self) -> str:
        """Generate index structure."""
        return (
            '## Contents\n'
            'Category 1\n'
            'Category 2\n'
        )

    def _generate_overview(self) -> str:
        """Generate overview section."""
        return (
            '## Overview\n'
            'This index provides navigation to all documents in this '
            'collection. Use the sections below to find what you need.\n'
        )