Skip to main content

scripts-adr-extractor

#!/usr/bin/env python3
"""
CP-11: ADR Node Extractor (ADR-151)

Scans ADR markdown files (ADR-213: coditect-documentation/coditect-core/adrs/):

  • node_type: 'adr'
  • Properties: adr_number, title, status, file_path, file_name
    (optional: created, track, task_id, tags, related_adrs, decision_makers, summary)

Source: coditect-documentation/coditect-core/adrs/*.md (via paths.py get_adrs_dir())
Target: org.db kg_nodes table

Created: 2026-02-03
Track: J (Memory Intelligence)
Task: J.3.4.9
"""

import logging
import re
from pathlib import Path
from typing import Any, Dict, Generator, Optional, Tuple

import yaml

from .base_extractor import BaseExtractor

# Module-level logger keyed on this module's import name (``__name__``).
logger = logging.getLogger(__name__)

# Shape of one extracted node:
# (node_id, name, subtype, properties, source_table, source_id)
_ADRNode = Tuple[str, str, Optional[str], Dict[str, Any], Optional[str], Optional[str]]


class ADRExtractor(BaseExtractor):
    """
    Extract ADR entities from markdown files into kg_nodes.

    ADRs are scanned from coditect-documentation (ADR-213) via paths.py get_adrs_dir().
    The directory to scan is supplied by the caller as ``adrs_dir``; every
    ``ADR-*.md`` file in it yields at most one node tuple.
    """

    # Pattern to extract ADR number and title slug from a standard filename
    ADR_FILENAME_PATTERN = re.compile(r'^ADR-(\d+)[-_](.+)\.md$', re.IGNORECASE)

    # Looser fallback: only the "ADR<sep><digits>" prefix is required
    ADR_PREFIX_PATTERN = re.compile(r'^ADR[-_]?(\d+)', re.IGNORECASE)

    # YAML frontmatter fenced by --- lines at the very start of the content.
    # The closing fence may be followed by a newline or by end-of-file.
    FRONTMATTER_PATTERN = re.compile(r'^---\s*\n(.*?)\n---\s*(?:\n|\Z)', re.DOTALL)

    # A "## Summary" / "## Executive Summary" heading on its own line
    SUMMARY_HEADING_PATTERN = re.compile(
        r'^#{2,}[ \t]*(?:Executive[ \t]+)?Summary[ \t\r]*$',
        re.IGNORECASE | re.MULTILINE,
    )

    # Status values that are used verbatim as the node subtype
    KNOWN_STATUSES = ('proposed', 'accepted', 'deprecated', 'superseded')

    # (frontmatter key, property name) pairs copied as-is when present
    OPTIONAL_FIELDS = (
        ('track', 'track'),
        ('task_id', 'task_id'),
        ('tags', 'tags'),
        ('related', 'related_adrs'),
        ('decision_makers', 'decision_makers'),
    )

    # Maximum number of characters of summary text stored in properties
    SUMMARY_MAX_LENGTH = 500

    # Fallback paragraphs at or below this length are not used as a summary
    MIN_PARAGRAPH_LENGTH = 50

    def __init__(
        self,
        adrs_dir: Path,
        target_db_path: Path,
        dry_run: bool = False,
        tenant_id: Optional[str] = None,
        project_id: Optional[str] = None,
    ):
        """
        Store the ADR directory and forward the remaining arguments to the base class.

        Args:
            adrs_dir: Directory that is globbed for ``ADR-*.md`` files.
            target_db_path: Passed unchanged to ``BaseExtractor.__init__``.
            dry_run: Passed unchanged to ``BaseExtractor.__init__``.
            tenant_id: Passed unchanged to ``BaseExtractor.__init__``.
            project_id: Passed unchanged to ``BaseExtractor.__init__``.
        """
        super().__init__(target_db_path, dry_run, tenant_id, project_id)
        self.adrs_dir = adrs_dir

    @property
    def node_type(self) -> str:
        """Node type label for every entity this extractor produces."""
        return "adr"

    def extract_entities(self) -> Generator[_ADRNode, None, None]:
        """
        Extract ADRs from markdown files.

        Files are processed in sorted path order. A file that raises while
        being parsed is logged at warning level and skipped, so one bad ADR
        does not abort the scan.

        Yields:
            Tuple of (node_id, name, subtype, properties, source_table, source_id)
        """
        if not self.adrs_dir.is_dir():
            logger.warning("ADR directory not found: %s", self.adrs_dir)
            return

        # Find all ADR files
        adr_files = sorted(self.adrs_dir.glob("ADR-*.md"))
        logger.info("Found %d ADR files", len(adr_files))

        for adr_file in adr_files:
            try:
                adr_data = self._parse_adr_file(adr_file)
            except Exception as e:  # deliberate best-effort: skip the bad file, keep scanning
                logger.warning("Error parsing %s: %s", adr_file.name, e)
                continue
            if adr_data:
                yield adr_data

    def _parse_adr_file(self, adr_file: Path) -> Optional[_ADRNode]:
        """
        Parse an ADR markdown file.

        The ADR number comes from the filename. Title and status come from
        YAML frontmatter when present; the title otherwise falls back to the
        filename slug and the status to ``'unknown'``.

        Returns:
            Tuple of (node_id, name, subtype, properties, source_table, source_id)
            or None if the filename carries no ADR number
        """
        file_name = adr_file.name

        # Extract ADR number (and title slug) from filename
        match = self.ADR_FILENAME_PATTERN.match(file_name)
        if match:
            adr_number = int(match.group(1))
            title_slug = match.group(2)
        else:
            # Try the looser prefix-only pattern
            prefix = self.ADR_PREFIX_PATTERN.match(file_name)
            if not prefix:
                logger.debug("Skipping non-standard ADR filename: %s", file_name)
                return None
            adr_number = int(prefix.group(1))
            # Drop exactly the text that matched, so zero-padded numbers
            # ("ADR-007") are removed as well, then any leftover separators.
            title_slug = adr_file.stem[prefix.end():].lstrip('-_ ')

        # utf-8-sig also reads plain UTF-8, but strips a leading BOM that
        # would otherwise hide the frontmatter fence from the regex.
        content = adr_file.read_text(encoding='utf-8-sig')

        # Parse YAML frontmatter
        frontmatter = self._extract_frontmatter(content)

        adr_id = f"ADR-{adr_number:03d}"
        # generate_node_id is inherited from BaseExtractor
        node_id = self.generate_node_id(adr_id)

        # Title: frontmatter first, then filename slug, then the ADR id itself
        raw_title = frontmatter.get('title')
        title = str(raw_title).strip() if raw_title is not None else ''
        if not title:
            title = title_slug.replace('-', ' ').replace('_', ' ').title()
        if not title:
            title = adr_id

        # Display name
        name = f"{adr_id}: {title}"

        # Status may be missing, null or a non-string scalar in the YAML
        raw_status = frontmatter.get('status')
        status = str(raw_status).strip().lower() if raw_status is not None else ''
        if not status:
            status = 'unknown'

        # Unrecognised statuses are stored as-is in properties but the
        # subtype defaults to 'accepted'.
        subtype = status if status in self.KNOWN_STATUSES else 'accepted'

        # Build properties
        properties: Dict[str, Any] = {
            "adr_number": adr_number,
            "title": title,
            "status": status,
            "file_path": str(adr_file),
            "file_name": file_name,
        }

        # Add optional frontmatter fields
        if 'date' in frontmatter:
            properties['created'] = str(frontmatter['date'])
        for source_key, property_key in self.OPTIONAL_FIELDS:
            if source_key in frontmatter:
                properties[property_key] = frontmatter[source_key]

        # Extract summary from the document body
        summary = self._extract_summary(content)
        if summary:
            properties['summary'] = summary[:self.SUMMARY_MAX_LENGTH]  # Truncate

        return (
            node_id,
            name,
            subtype,
            properties,
            None,           # source_table (file-based)
            str(adr_file),  # source_id (file path)
        )

    def _extract_frontmatter(self, content: str) -> Dict[str, Any]:
        """
        Extract YAML frontmatter from markdown content.

        Returns:
            Dict of frontmatter fields; empty dict if there is no frontmatter,
            if it is not valid YAML, or if it does not parse to a mapping
        """
        # Match YAML frontmatter between --- markers
        frontmatter_match = self.FRONTMATTER_PATTERN.match(content)
        if not frontmatter_match:
            return {}

        try:
            data = yaml.safe_load(frontmatter_match.group(1))
        except yaml.YAMLError as e:
            logger.debug("YAML parse error in frontmatter: %s", e)
            return {}

        # safe_load returns a list or scalar for non-mapping YAML; callers
        # rely on dict methods, so anything else is treated as "no fields".
        return data if isinstance(data, dict) else {}

    def _extract_summary(self, content: str) -> Optional[str]:
        """
        Extract summary/first paragraph from ADR content.

        Looks for:
        1. Executive Summary section
        2. Summary section
        3. First paragraph after frontmatter

        Returns:
            The summary text, or None if no suitable paragraph exists
        """
        # Remove frontmatter
        body = self.FRONTMATTER_PATTERN.sub('', content, count=1)

        # Look for Executive Summary or Summary section and take its first
        # paragraph: consecutive non-blank lines up to the next heading.
        for heading in self.SUMMARY_HEADING_PATTERN.finditer(body):
            section_lines = []
            for line in body[heading.end():].splitlines():
                stripped = line.strip()
                if not stripped:
                    if section_lines:
                        break      # blank line ends the paragraph
                    continue       # blank lines between heading and text
                if stripped.startswith('#'):
                    break          # next heading: the section is over
                section_lines.append(line.rstrip())
            if section_lines:
                return '\n'.join(section_lines).strip()

        # Fall back to first non-empty paragraph
        for para in re.split(r'\n\s*\n', body):
            para = para.strip()
            # Skip headers and short/empty paragraphs
            if para and not para.startswith('#') and len(para) > self.MIN_PARAGRAPH_LENGTH:
                return para

        return None