scripts-adr-extractor
#!/usr/bin/env python3
"""
CP-11: ADR Node Extractor (ADR-151)

Scans ADR markdown files (ADR-213: coditect-documentation/coditect-core/adrs/):
- node_type: 'adr'
- Properties: number, title, status, track, created, updated

Source: coditect-documentation/coditect-core/adrs/*.md (via paths.py get_adrs_dir())
Target: org.db kg_nodes table

Created: 2026-02-03
Track: J (Memory Intelligence)
Task: J.3.4.9
"""
import logging import re from pathlib import Path from typing import Any, Dict, Generator, Optional, Tuple
import yaml
from .base_extractor import BaseExtractor
# Module-level logger, named after this module via __name__.
# (The bare identifier `name` is undefined and raised NameError at import.)
logger = logging.getLogger(__name__)
class ADRExtractor(BaseExtractor):
    """
    Extract ADR entities from markdown files into kg_nodes.

    ADRs are scanned from coditect-documentation (ADR-213) via paths.py get_adrs_dir().

    Each ``ADR-*.md`` file in ``adrs_dir`` becomes one entity tuple of the form
    ``(node_id, name, subtype, properties, source_table, source_id)``.
    """

    # Pattern to extract ADR number and title slug from a standard filename
    ADR_FILENAME_PATTERN = re.compile(r'^ADR-(\d+)[-_](.+)\.md$', re.IGNORECASE)

    # Looser fallback used when the standard pattern fails: only the
    # leading "ADR<optional separator><number>" prefix is required.
    ADR_PREFIX_PATTERN = re.compile(r'^ADR[-_]?(\d+)', re.IGNORECASE)

    # YAML frontmatter between '---' markers at the very start of the file.
    # The closing marker may be followed by a newline or by end-of-file.
    FRONTMATTER_PATTERN = re.compile(r'^---\s*\n(.*?)\n---\s*(?:\n|$)', re.DOTALL)

    # A "Summary" / "Executive Summary" heading occupying a whole line.
    SUMMARY_HEADING_PATTERN = re.compile(
        r'^#{2,}[ \t]*(?:Executive\s+)?Summary[ \t]*$',
        re.IGNORECASE | re.MULTILINE,
    )

    # Status values that are used verbatim as the node subtype.
    VALID_STATUSES = ('proposed', 'accepted', 'deprecated', 'superseded')

    # (frontmatter key, property name) pairs copied as-is when present.
    OPTIONAL_FIELDS = (
        ('track', 'track'),
        ('task_id', 'task_id'),
        ('tags', 'tags'),
        ('related', 'related_adrs'),
        ('decision_makers', 'decision_makers'),
    )

    # Maximum number of characters of summary text stored in properties.
    SUMMARY_MAX_CHARS = 500

    def __init__(
        self,
        adrs_dir: Path,
        target_db_path: Path,
        dry_run: bool = False,
        tenant_id: Optional[str] = None,
        project_id: Optional[str] = None,
    ):
        """
        Create an extractor for the ADR files under ``adrs_dir``.

        Args:
            adrs_dir: Directory that is globbed for ``ADR-*.md`` files.
            target_db_path: Forwarded to ``BaseExtractor.__init__``.
            dry_run: Forwarded to ``BaseExtractor.__init__``.
            tenant_id: Forwarded to ``BaseExtractor.__init__``.
            project_id: Forwarded to ``BaseExtractor.__init__``.
        """
        super().__init__(target_db_path, dry_run, tenant_id, project_id)
        self.adrs_dir = adrs_dir

    @property
    def node_type(self) -> str:
        """Return the node type label for this extractor: ``"adr"``."""
        return "adr"

    def extract_entities(self) -> Generator[Tuple[str, str, Optional[str], Dict[str, Any], Optional[str], Optional[str]], None, None]:
        """
        Extract ADRs from markdown files.

        Files are processed in sorted path order. A file that raises while
        being parsed is logged at warning level and skipped, so one bad file
        does not abort the whole scan.

        Yields:
            Tuple of (node_id, name, subtype, properties, source_table, source_id)
        """
        if not self.adrs_dir.is_dir():
            logger.warning(f"ADR directory not found: {self.adrs_dir}")
            return

        # Find all ADR files
        adr_files = sorted(self.adrs_dir.glob("ADR-*.md"))
        logger.info(f"Found {len(adr_files)} ADR files")

        for adr_file in adr_files:
            try:
                adr_data = self._parse_adr_file(adr_file)
            except Exception as e:
                # Deliberate best-effort boundary: report and move on.
                logger.warning(f"Error parsing {adr_file.name}: {e}")
                continue
            if adr_data:
                yield adr_data

    def _parse_adr_file(self, adr_file: Path) -> Optional[Tuple[str, str, Optional[str], Dict[str, Any], Optional[str], Optional[str]]]:
        """
        Parse an ADR markdown file.

        Args:
            adr_file: Path of the markdown file to parse.

        Returns:
            Tuple of (node_id, name, subtype, properties, source_table, source_id)
            or None if the filename carries no ADR number.
        """
        # Extract ADR number (and title slug) from the filename
        name_match = self.ADR_FILENAME_PATTERN.match(adr_file.name)
        if name_match:
            adr_number = int(name_match.group(1))
            title_slug = name_match.group(2)
        else:
            # Try the looser prefix-only pattern
            prefix_match = self.ADR_PREFIX_PATTERN.match(adr_file.name)
            if not prefix_match:
                logger.debug(f"Skipping non-standard ADR filename: {adr_file.name}")
                return None
            adr_number = int(prefix_match.group(1))
            # Strip the prefix by match position rather than by rebuilding it
            # from the int: rebuilding loses zero padding ("ADR-007" vs
            # "ADR-7") and therefore never matched padded filenames.
            remainder = adr_file.stem[prefix_match.end():].lstrip('-_ ')
            title_slug = remainder or adr_file.stem

        # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which
        # would otherwise hide the frontmatter from the '^---' match.
        content = adr_file.read_text(encoding='utf-8-sig')

        # Parse YAML frontmatter
        frontmatter = self._extract_frontmatter(content)

        # Title from frontmatter; fall back to the filename slug when the key
        # is missing or empty.
        slug_title = title_slug.replace('-', ' ').replace('_', ' ').title()
        title = str(frontmatter.get('title') or '').strip() or slug_title

        # Generate node_id
        adr_id = f"ADR-{adr_number:03d}"
        node_id = self.generate_node_id(adr_id)

        # Display name
        name = f"{adr_id}: {title}"

        # Determine subtype from status. An empty or non-string status value
        # is coerced instead of raising on .lower().
        status = str(frontmatter.get('status') or 'unknown').strip().lower()
        # NOTE(review): any unrecognised status (including 'unknown') maps to
        # subtype 'accepted' — existing behaviour, kept as-is; confirm intent.
        subtype = status if status in self.VALID_STATUSES else 'accepted'

        # Build properties
        properties: Dict[str, Any] = {
            "adr_number": adr_number,
            "title": title,
            "status": status,
            "file_path": str(adr_file),
            "file_name": adr_file.name,
        }

        # Add optional frontmatter fields
        if 'date' in frontmatter:
            properties['created'] = str(frontmatter['date'])
        for source_key, property_key in self.OPTIONAL_FIELDS:
            if source_key in frontmatter:
                properties[property_key] = frontmatter[source_key]

        # Extract summary text from the document body
        summary = self._extract_summary(content)
        if summary:
            properties['summary'] = summary[:self.SUMMARY_MAX_CHARS]  # Truncate

        return (
            node_id,
            name,
            subtype,
            properties,
            None,  # source_table (file-based)
            str(adr_file),  # source_id (file path)
        )

    def _extract_frontmatter(self, content: str) -> Dict[str, Any]:
        """
        Extract YAML frontmatter from markdown content.

        Args:
            content: Full text of the markdown file.

        Returns:
            Dict of frontmatter fields; empty dict if there is no frontmatter,
            the YAML fails to parse, or the YAML is not a mapping.
        """
        found = self.FRONTMATTER_PATTERN.match(content)
        if not found:
            return {}

        try:
            parsed = yaml.safe_load(found.group(1))
        except yaml.YAMLError as e:
            logger.debug(f"YAML parse error in frontmatter: {e}")
            return {}

        # safe_load returns None for empty input and may return a scalar or
        # list; callers rely on dict methods, so only a mapping is passed on.
        if not isinstance(parsed, dict):
            return {}
        return parsed

    def _extract_summary(self, content: str) -> Optional[str]:
        """
        Extract summary/first paragraph from ADR content.

        Looks for:
        1. The first paragraph under an "Executive Summary" or "Summary" heading
        2. Otherwise the first paragraph longer than 50 characters that does
           not start with '#'

        Args:
            content: Full text of the markdown file (frontmatter included).

        Returns:
            The summary text, or None if nothing suitable is found.
        """
        # Remove frontmatter
        body = self.FRONTMATTER_PATTERN.sub('', content, count=1)

        # Look for Executive Summary or Summary section
        heading = self.SUMMARY_HEADING_PATTERN.search(body)
        if heading:
            section_lines = []
            for line in body[heading.end():].splitlines():
                stripped = line.strip()
                if stripped.startswith('#'):
                    # Next heading ends the section.
                    break
                if not stripped:
                    if section_lines:
                        # Blank line ends the first paragraph.
                        break
                    # Blank lines between heading and text are skipped.
                    continue
                # A '#' inside the line (e.g. "issue #42") is ordinary text
                # and must not cut the summary short.
                section_lines.append(line)
            if section_lines:
                return '\n'.join(section_lines).strip()

        # Fall back to first non-empty paragraph
        for para in re.split(r'\n\s*\n', body):
            para = para.strip()
            # Skip headers and short/empty paragraphs
            if para and not para.startswith('#') and len(para) > 50:
                return para

        return None