#!/usr/bin/env python3
"""J.20.4: Batch fix missing/poor frontmatter in document files.

Reads documents from platform.db, identifies quality issues, and patches
frontmatter fields directly in the markdown files.

Usage:
    python3 scripts/fix-frontmatter.py --dry-run        # Preview changes
    python3 scripts/fix-frontmatter.py                  # Apply fixes
    python3 scripts/fix-frontmatter.py --fix title      # Fix only titles
    python3 scripts/fix-frontmatter.py --fix summary    # Fix only summaries
    python3 scripts/fix-frontmatter.py --fix audience   # Fix only audience
    python3 scripts/fix-frontmatter.py --fix status     # Fix only status
    python3 scripts/fix-frontmatter.py --fix all        # Fix everything

Created: 2026-02-17
Task: J.20 (Document Taxonomy System)
"""

import argparse
import os
import re
import sqlite3
import sys
from pathlib import Path

# was `Path(file)` — a NameError; `__file__` is the script's own path.
SCRIPT_DIR = Path(__file__).resolve().parent
ROOT_DIR = SCRIPT_DIR.parent

# Resolve platform.db: prefer the project's paths helper, fall back to the
# default data directory when the helper is unavailable.
sys.path.insert(0, str(SCRIPT_DIR / "core"))
try:
    from paths import get_context_storage_dir
    DB_PATH = get_context_storage_dir() / "platform.db"
except ImportError:
    DB_PATH = Path.home() / "PROJECTS" / ".coditect-data" / "context-storage" / "platform.db"

# Directories to skip (external artifacts, symlinks)
SKIP_PREFIXES = [
    "analyze-new-artifacts/",
    "codanna/",
]

# Audience inference from directory path. First matching prefix wins, so
# more specific prefixes (e.g. "docs/reference/") must precede their
# parents (e.g. "docs/").
AUDIENCE_MAP = {
    "internal/": "contributor",
    "docs/reference/": "technical",
    "docs/guides/": "user",
    "docs/getting-started/": "user",
    "docs/workflows/": "technical",
    "docs/": "user",
    "coditect-core-standards/": "contributor",
    "templates/": "technical",
    "config/": "technical",
    "prompts/": "technical",
    "distribution/": "technical",
    "tools/": "technical",
    "lib/": "technical",
    "reports/": "contributor",
}

def should_skip(path: str) -> bool:
    """Check if a document should be skipped.

    A document is skipped when its path falls under one of SKIP_PREFIXES
    (external artifacts) or when the file on disk is a symlink.
    """
    if any(path.startswith(prefix) for prefix in SKIP_PREFIXES):
        return True
    # Symlinked docs point at content owned elsewhere — leave them alone.
    return (ROOT_DIR / path).is_symlink()

def infer_audience(path: str) -> str:
    """Infer the target audience for a document from its directory path.

    Returns the audience of the first AUDIENCE_MAP prefix that matches,
    falling back to "technical" when none does.
    """
    return next(
        (audience for prefix, audience in AUDIENCE_MAP.items() if path.startswith(prefix)),
        "technical",
    )

def extract_title_from_content(path: str) -> str:
    """Extract a title from the first markdown heading in the file.

    Returns "" when the file is missing, unreadable, or has no heading.
    The result is stripped of emphasis/inline-code markers and capped at
    200 characters.
    """
    full_path = ROOT_DIR / path
    if not full_path.exists():
        return ""
    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
    except (OSError, UnicodeDecodeError):
        return ""
    # Skip past frontmatter
    if content.startswith('---'):
        end = content.find('\n---\n', 4)
        if end > 0:
            content = content[end + 5:]
    # Find first heading
    match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    if not match:
        return ""
    title = match.group(1).strip()
    # Clean up markdown formatting: strip bold/italic asterisks and unwrap
    # inline code spans. (The previous patterns r'*{1,3}' and r'([^]+)`'
    # were invalid/garbled regexes and raised re.error at runtime.)
    title = re.sub(r'\*{1,3}', '', title)
    title = re.sub(r'`([^`]+)`', r'\1', title)
    return title[:200]  # Cap at 200 chars

def extract_summary_from_content(path: str) -> str:
    """Extract a summary from the first prose paragraph of a markdown file.

    Skips frontmatter, headings, and structural lines (tables, fences,
    blockquotes, link/image lists, HTML). Inline markdown is stripped and
    the result is truncated to 200 characters. Returns "" when nothing
    suitable is found or the file cannot be read.
    """
    full_path = ROOT_DIR / path
    if not full_path.exists():
        return ""
    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
        # Skip past frontmatter
        if content.startswith('---'):
            end = content.find('\n---\n', 4)
            if end > 0:
                content = content[end + 5:]

        # Collect the first contiguous run of plain prose lines.
        para_lines = []
        found_content = False
        for raw_line in content.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                if found_content and para_lines:
                    break  # Blank line ends the paragraph.
                continue
            if stripped.startswith('#'):
                if found_content and para_lines:
                    break  # Next heading ends the paragraph.
                continue
            if stripped.startswith(('---', '```', '|', '>', '- [', '* [', '![', '<')):
                if para_lines:
                    break
                continue
            found_content = True
            para_lines.append(stripped)

        if not para_lines:
            return ""
        summary = ' '.join(para_lines)
        # Strip inline markdown: emphasis, code spans, links.
        summary = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', summary)
        summary = re.sub(r'`([^`]+)`', r'\1', summary)
        summary = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', summary)
        # Truncate
        if len(summary) > 200:
            summary = summary[:197] + '...'
        return summary
    except (OSError, UnicodeDecodeError):
        return ""

def title_from_filename(path: str) -> str:
    """Generate a readable title from a document's filename.

    Known ID prefixes ("TRACK-X-", "ADR-NNN-") become "...: ", leading
    numeric ordering prefixes ("01-") are dropped, separators turn into
    spaces, and each word is title-cased except multi-letter acronyms,
    which are preserved as-is.
    """
    stem = Path(path).stem
    # Handle common patterns before splitting on separators.
    stem = re.sub(r'^(TRACK-[A-Z])-', r'\1: ', stem)
    stem = re.sub(r'^(ADR-\d+)-', r'\1: ', stem)
    stem = re.sub(r'^(\d+)-', '', stem)
    words = stem.replace('-', ' ').replace('_', ' ').split()
    return ' '.join(
        w if (w.isupper() and len(w) > 1) else w.capitalize()
        for w in words
    )

def update_frontmatter(path: str, updates: dict, dry_run: bool = True) -> bool:
    """Update frontmatter fields in a markdown file.

    Only fills fields that are missing, empty, or marked 'Auto-classified';
    hand-written values are never overwritten.

    Args:
        path: Document path relative to ROOT_DIR.
        updates: Mapping of frontmatter key -> new value.
        dry_run: When True, report whether a change would be made without
            writing anything to disk.

    Returns:
        True if the file was modified (or would be, under dry_run).
    """
    full_path = ROOT_DIR / path
    if not full_path.exists() or full_path.is_symlink():
        return False

    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
    except (OSError, UnicodeDecodeError):
        return False

    if not content.startswith('---'):
        # No frontmatter — skip (adding frontmatter is a bigger operation)
        return False

    # Find frontmatter boundaries. Track the marker length: the old code
    # always skipped 5 chars, which left a stray byte in `body` whenever
    # the CRLF marker '\n---\r\n' (6 chars) was the one that matched.
    end_idx = content.find('\n---\n', 4)
    marker_len = 5
    if end_idx < 0:
        end_idx = content.find('\n---\r\n', 4)
        marker_len = 6
    if end_idx < 0:
        return False

    fm_text = content[4:end_idx]           # Between the --- markers
    body = content[end_idx + marker_len:]  # After closing ---
    modified = False

    for key, value in updates.items():
        value_str = str(value)
        # YAML single-quoted scalars escape ' by doubling it.
        safe_value = value_str.replace("'", "''") if "'" in value_str else value_str

        # Quote values that would otherwise break YAML parsing.
        if '\n' in safe_value or ':' in safe_value or "'" in value_str:
            new_line = f"{key}: '{safe_value}'"
        else:
            new_line = f"{key}: {safe_value}"

        # Check if key exists in frontmatter
        pattern = re.compile(rf'^{re.escape(key)}:\s*.*$', re.MULTILINE)
        match = pattern.search(fm_text)

        if match:
            old_val = match.group(0).split(':', 1)[1].strip().strip("'\"")
            if old_val and 'Auto-classified' not in old_val:
                continue  # Don't overwrite existing good values

            # Replace existing key. Use a callable replacement so that
            # backslashes or '\1' sequences in the value are inserted
            # literally (a plain string repl would treat them as regex
            # escapes — a latent bug in the old code).
            fm_text = pattern.sub(lambda _m, line=new_line: line, fm_text, count=1)
            modified = True
        else:
            # Add new key before the closing ---
            fm_text += f"\n{new_line}"
            modified = True

    if not modified:
        return False

    new_content = f"---\n{fm_text}\n---\n{body}"

    if dry_run:
        return True

    full_path.write_text(new_content, encoding='utf-8')
    return True

def get_documents_needing_fixes(conn: sqlite3.Connection) -> list:
    """Load indexed documents and their frontmatter from platform.db.

    Only documents that have at least one frontmatter row are returned;
    paths rejected by should_skip() are filtered out. Each entry is a dict
    with 'id', 'path', 'category', and 'frontmatter' keys.
    """
    cur = conn.cursor()
    cur.execute('''
        SELECT c.id, c.path, c.category
        FROM components c
        WHERE c.type = 'document'
          AND c.id IN (SELECT DISTINCT component_id FROM document_frontmatter)
        ORDER BY c.path
    ''')
    rows = cur.fetchall()  # Materialize before reusing the cursor below.

    docs = []
    for comp_id, doc_path, doc_category in rows:
        if should_skip(doc_path):
            continue
        cur.execute(
            'SELECT key, value FROM document_frontmatter WHERE component_id = ?',
            (comp_id,),
        )
        docs.append({
            'id': comp_id,
            'path': doc_path,
            'category': doc_category,
            'frontmatter': dict(cur.fetchall()),
        })
    return docs

def main():
    """Entry point: parse CLI args, analyze documents, patch frontmatter."""
    parser = argparse.ArgumentParser(description="Batch fix document frontmatter")
    parser.add_argument("--dry-run", action="store_true",
                        help="Preview changes without writing")
    parser.add_argument("--fix", nargs="+", default=["all"],
                        choices=["title", "summary", "audience", "status", "keywords", "all"],
                        help="Which fields to fix (default: all)")
    parser.add_argument("--db", default=str(DB_PATH), help="Path to platform.db")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show each fix")
    args = parser.parse_args()

    fix_all = "all" in args.fix
    fix_title = fix_all or "title" in args.fix
    fix_summary = fix_all or "summary" in args.fix
    fix_audience = fix_all or "audience" in args.fix
    fix_status = fix_all or "status" in args.fix

    conn = sqlite3.connect(args.db)
    docs = get_documents_needing_fixes(conn)
    conn.close()

    print(f"Analyzing {len(docs)} documents...")
    if args.dry_run:
        print("DRY RUN — no files will be modified\n")

    stats = {
        'title_fixed': 0,
        'summary_fixed': 0,
        'audience_fixed': 0,
        'status_fixed': 0,
        'files_modified': 0,
        'files_skipped': 0,
    }

    for doc in docs:
        path = doc['path']
        fm = doc['frontmatter']
        updates = {}

        # Fix missing title
        if fix_title:
            title = fm.get('title', '')
            if not title or 'Auto-classified' in title:
                # Prefer the document's own heading; fall back to filename.
                new_title = extract_title_from_content(path) or title_from_filename(path)
                if new_title:
                    updates['title'] = new_title
                    stats['title_fixed'] += 1

        # Fix missing/auto summary
        if fix_summary:
            summary = fm.get('summary', '')
            if not summary or 'Auto-classified' in summary:
                new_summary = extract_summary_from_content(path)
                # Very short extractions are usually noise — skip them.
                if new_summary and len(new_summary) > 15:
                    updates['summary'] = new_summary
                    stats['summary_fixed'] += 1

        # Fix missing audience
        if fix_audience and not fm.get('audience', ''):
            updates['audience'] = infer_audience(path)
            stats['audience_fixed'] += 1

        # Fix missing status
        if fix_status and not fm.get('status', ''):
            updates['status'] = 'active'
            stats['status_fixed'] += 1

        if not updates:
            continue

        if args.verbose or args.dry_run:
            print(f" {path}")
            for k, v in updates.items():
                old = fm.get(k, '<missing>')
                print(f" {k}: {old[:50]} -> {str(v)[:60]}")

        if update_frontmatter(path, updates, dry_run=args.dry_run):
            stats['files_modified'] += 1
        else:
            stats['files_skipped'] += 1

    # Summary
    print(f"\n{'DRY RUN ' if args.dry_run else ''}RESULTS:")
    print(f" Files {'would be ' if args.dry_run else ''}modified: {stats['files_modified']}")
    print(f" Files skipped: {stats['files_skipped']}")
    print(f" Titles fixed: {stats['title_fixed']}")
    print(f" Summaries fixed: {stats['summary_fixed']}")
    print(f" Audience fixed: {stats['audience_fixed']}")
    print(f" Status fixed: {stats['status_fixed']}")

    if args.dry_run:
        print("\nRun without --dry-run to apply changes.")
        print("Then run: python3 scripts/component-indexer.py (to rebuild index)")
        print("Then run: python3 scripts/generate-publish-manifest-db.py --stats (to regenerate manifest)")

# Was `if name == "main"` — the dunder underscores were stripped by the
# paste, making this a NameError at import time. Restore the standard guard.
if __name__ == "__main__":
    main()