#!/usr/bin/env python3
"""J.20.4: Batch fix missing/poor frontmatter in document files.

Reads documents from platform.db, identifies quality issues, and patches
frontmatter fields directly in the markdown files.

Usage:
    python3 scripts/fix-frontmatter.py --dry-run        # Preview changes
    python3 scripts/fix-frontmatter.py                  # Apply fixes
    python3 scripts/fix-frontmatter.py --fix title      # Fix only titles
    python3 scripts/fix-frontmatter.py --fix summary    # Fix only summaries
    python3 scripts/fix-frontmatter.py --fix audience   # Fix only audience
    python3 scripts/fix-frontmatter.py --fix status     # Fix only status
    python3 scripts/fix-frontmatter.py --fix all        # Fix everything

Created: 2026-02-17
Task: J.20 (Document Taxonomy System)
"""

import argparse
import os
import re
import sqlite3
import sys
from pathlib import Path

# was `Path(file)` — a NameError; `__file__` is the script's own path.
SCRIPT_DIR = Path(__file__).resolve().parent
ROOT_DIR = SCRIPT_DIR.parent

# Resolve platform.db: prefer the project's paths helper, fall back to the
# default data directory when the helper is unavailable.
sys.path.insert(0, str(SCRIPT_DIR / "core"))
try:
    from paths import get_context_storage_dir
    DB_PATH = get_context_storage_dir() / "platform.db"
except ImportError:
    DB_PATH = Path.home() / "PROJECTS" / ".coditect-data" / "context-storage" / "platform.db"

# Directories to skip (external artifacts, symlinks)
SKIP_PREFIXES = [
    "analyze-new-artifacts/",
    "codanna/",
]

# Audience inference from directory path. First matching prefix wins, so
# more specific prefixes (e.g. "docs/reference/") must precede their
# parents (e.g. "docs/").
AUDIENCE_MAP = {
    "internal/": "contributor",
    "docs/reference/": "technical",
    "docs/guides/": "user",
    "docs/getting-started/": "user",
    "docs/workflows/": "technical",
    "docs/": "user",
    "coditect-core-standards/": "contributor",
    "templates/": "technical",
    "config/": "technical",
    "prompts/": "technical",
    "distribution/": "technical",
    "tools/": "technical",
    "lib/": "technical",
    "reports/": "contributor",
}

def should_skip(path: str) -> bool:
    """Check if a document should be skipped.

    A document is skipped when its path falls under one of SKIP_PREFIXES
    (external artifacts) or when the file on disk is a symlink.
    """
    if any(path.startswith(prefix) for prefix in SKIP_PREFIXES):
        return True
    # Symlinked docs point at content owned elsewhere — leave them alone.
    return (ROOT_DIR / path).is_symlink()

def infer_audience(path: str) -> str:
    """Infer the target audience for a document from its directory path.

    Returns the audience of the first AUDIENCE_MAP prefix that matches,
    falling back to "technical" when none does.
    """
    return next(
        (audience for prefix, audience in AUDIENCE_MAP.items() if path.startswith(prefix)),
        "technical",
    )

def extract_title_from_content(path: str) -> str:
    """Extract a title from the first markdown heading in the file.

    Returns "" when the file is missing, unreadable, or has no heading.
    The result is stripped of emphasis/inline-code markers and capped at
    200 characters.
    """
    full_path = ROOT_DIR / path
    if not full_path.exists():
        return ""
    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
    except (OSError, UnicodeDecodeError):
        return ""
    # Skip past frontmatter
    if content.startswith('---'):
        end = content.find('\n---\n', 4)
        if end > 0:
            content = content[end + 5:]
    # Find first heading
    match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    if not match:
        return ""
    title = match.group(1).strip()
    # Clean up markdown formatting: strip bold/italic asterisks and unwrap
    # inline code spans. (The previous patterns r'*{1,3}' and r'([^]+)`'
    # were invalid/garbled regexes and raised re.error at runtime.)
    title = re.sub(r'\*{1,3}', '', title)
    title = re.sub(r'`([^`]+)`', r'\1', title)
    return title[:200]  # Cap at 200 chars

def extract_summary_from_content(path: str) -> str:
    """Extract a summary from the first prose paragraph of a markdown file.

    Skips frontmatter, headings, and structural lines (tables, fences,
    blockquotes, link/image lists, HTML). Inline markdown is stripped and
    the result is truncated to 200 characters. Returns "" when nothing
    suitable is found or the file cannot be read.
    """
    full_path = ROOT_DIR / path
    if not full_path.exists():
        return ""
    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
        # Skip past frontmatter
        if content.startswith('---'):
            end = content.find('\n---\n', 4)
            if end > 0:
                content = content[end + 5:]

        # Collect the first contiguous run of plain prose lines.
        para_lines = []
        found_content = False
        for raw_line in content.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                if found_content and para_lines:
                    break  # Blank line ends the paragraph.
                continue
            if stripped.startswith('#'):
                if found_content and para_lines:
                    break  # Next heading ends the paragraph.
                continue
            if stripped.startswith(('---', '```', '|', '>', '- [', '* [', '![', '<')):
                if para_lines:
                    break
                continue
            found_content = True
            para_lines.append(stripped)

        if not para_lines:
            return ""
        summary = ' '.join(para_lines)
        # Strip inline markdown: emphasis, code spans, links.
        summary = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', summary)
        summary = re.sub(r'`([^`]+)`', r'\1', summary)
        summary = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', summary)
        # Truncate
        if len(summary) > 200:
            summary = summary[:197] + '...'
        return summary
    except (OSError, UnicodeDecodeError):
        return ""

def title_from_filename(path: str) -> str:
    """Generate a readable title from a document's filename.

    Known ID prefixes ("TRACK-X-", "ADR-NNN-") become "...: ", leading
    numeric ordering prefixes ("01-") are dropped, separators turn into
    spaces, and each word is title-cased except multi-letter acronyms,
    which are preserved as-is.
    """
    stem = Path(path).stem
    # Handle common patterns before splitting on separators.
    stem = re.sub(r'^(TRACK-[A-Z])-', r'\1: ', stem)
    stem = re.sub(r'^(ADR-\d+)-', r'\1: ', stem)
    stem = re.sub(r'^(\d+)-', '', stem)
    words = stem.replace('-', ' ').replace('_', ' ').split()
    return ' '.join(
        w if (w.isupper() and len(w) > 1) else w.capitalize()
        for w in words
    )

def update_frontmatter(path: str, updates: dict, dry_run: bool = True) -> bool:
    """Update frontmatter fields in a markdown file.

    Only fills fields that are missing, empty, or marked 'Auto-classified';
    hand-written values are never overwritten.

    Args:
        path: Document path relative to ROOT_DIR.
        updates: Mapping of frontmatter key -> new value.
        dry_run: When True, report whether a change would be made without
            writing anything to disk.

    Returns:
        True if the file was modified (or would be, under dry_run).
    """
    full_path = ROOT_DIR / path
    if not full_path.exists() or full_path.is_symlink():
        return False

    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
    except (OSError, UnicodeDecodeError):
        return False

    if not content.startswith('---'):
        # No frontmatter — skip (adding frontmatter is a bigger operation)
        return False

    # Find frontmatter boundaries. Track the marker length: the old code
    # always skipped 5 chars, which left a stray byte in `body` whenever
    # the CRLF marker '\n---\r\n' (6 chars) was the one that matched.
    end_idx = content.find('\n---\n', 4)
    marker_len = 5
    if end_idx < 0:
        end_idx = content.find('\n---\r\n', 4)
        marker_len = 6
    if end_idx < 0:
        return False

    fm_text = content[4:end_idx]           # Between the --- markers
    body = content[end_idx + marker_len:]  # After closing ---
    modified = False

    for key, value in updates.items():
        value_str = str(value)
        # YAML single-quoted scalars escape ' by doubling it.
        safe_value = value_str.replace("'", "''") if "'" in value_str else value_str

        # Quote values that would otherwise break YAML parsing.
        if '\n' in safe_value or ':' in safe_value or "'" in value_str:
            new_line = f"{key}: '{safe_value}'"
        else:
            new_line = f"{key}: {safe_value}"

        # Check if key exists in frontmatter
        pattern = re.compile(rf'^{re.escape(key)}:\s*.*$', re.MULTILINE)
        match = pattern.search(fm_text)

        if match:
            old_val = match.group(0).split(':', 1)[1].strip().strip("'\"")
            if old_val and 'Auto-classified' not in old_val:
                continue  # Don't overwrite existing good values

            # Replace existing key. Use a callable replacement so that
            # backslashes or '\1' sequences in the value are inserted
            # literally (a plain string repl would treat them as regex
            # escapes — a latent bug in the old code).
            fm_text = pattern.sub(lambda _m, line=new_line: line, fm_text, count=1)
            modified = True
        else:
            # Add new key before the closing ---
            fm_text += f"\n{new_line}"
            modified = True

    if not modified:
        return False

    new_content = f"---\n{fm_text}\n---\n{body}"

    if dry_run:
        return True

    full_path.write_text(new_content, encoding='utf-8')
    return True

def get_documents_needing_fixes(conn: sqlite3.Connection) -> list:
    """Load indexed documents and their frontmatter from platform.db.

    Only documents that have at least one frontmatter row are returned;
    paths rejected by should_skip() are filtered out. Each entry is a dict
    with 'id', 'path', 'category', and 'frontmatter' keys.
    """
    cur = conn.cursor()
    cur.execute('''
        SELECT c.id, c.path, c.category
        FROM components c
        WHERE c.type = 'document'
          AND c.id IN (SELECT DISTINCT component_id FROM document_frontmatter)
        ORDER BY c.path
    ''')
    rows = cur.fetchall()  # Materialize before reusing the cursor below.

    docs = []
    for comp_id, doc_path, doc_category in rows:
        if should_skip(doc_path):
            continue
        cur.execute(
            'SELECT key, value FROM document_frontmatter WHERE component_id = ?',
            (comp_id,),
        )
        docs.append({
            'id': comp_id,
            'path': doc_path,
            'category': doc_category,
            'frontmatter': dict(cur.fetchall()),
        })
    return docs

def main():
    """Entry point: parse CLI args, analyze documents, patch frontmatter."""
    parser = argparse.ArgumentParser(description="Batch fix document frontmatter")
    parser.add_argument("--dry-run", action="store_true",
                        help="Preview changes without writing")
    parser.add_argument("--fix", nargs="+", default=["all"],
                        choices=["title", "summary", "audience", "status", "keywords", "all"],
                        help="Which fields to fix (default: all)")
    parser.add_argument("--db", default=str(DB_PATH), help="Path to platform.db")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show each fix")
    args = parser.parse_args()

    fix_all = "all" in args.fix
    fix_title = fix_all or "title" in args.fix
    fix_summary = fix_all or "summary" in args.fix
    fix_audience = fix_all or "audience" in args.fix
    fix_status = fix_all or "status" in args.fix

    conn = sqlite3.connect(args.db)
    docs = get_documents_needing_fixes(conn)
    conn.close()

    print(f"Analyzing {len(docs)} documents...")
    if args.dry_run:
        print("DRY RUN — no files will be modified\n")

    stats = {
        'title_fixed': 0,
        'summary_fixed': 0,
        'audience_fixed': 0,
        'status_fixed': 0,
        'files_modified': 0,
        'files_skipped': 0,
    }

    for doc in docs:
        path = doc['path']
        fm = doc['frontmatter']
        updates = {}

        # Fix missing title
        if fix_title:
            title = fm.get('title', '')
            if not title or 'Auto-classified' in title:
                # Prefer the document's own heading; fall back to filename.
                new_title = extract_title_from_content(path) or title_from_filename(path)
                if new_title:
                    updates['title'] = new_title
                    stats['title_fixed'] += 1

        # Fix missing/auto summary
        if fix_summary:
            summary = fm.get('summary', '')
            if not summary or 'Auto-classified' in summary:
                new_summary = extract_summary_from_content(path)
                # Very short extractions are usually noise — skip them.
                if new_summary and len(new_summary) > 15:
                    updates['summary'] = new_summary
                    stats['summary_fixed'] += 1

        # Fix missing audience
        if fix_audience and not fm.get('audience', ''):
            updates['audience'] = infer_audience(path)
            stats['audience_fixed'] += 1

        # Fix missing status
        if fix_status and not fm.get('status', ''):
            updates['status'] = 'active'
            stats['status_fixed'] += 1

        if not updates:
            continue

        if args.verbose or args.dry_run:
            print(f" {path}")
            for k, v in updates.items():
                old = fm.get(k, '<missing>')
                print(f" {k}: {old[:50]} -> {str(v)[:60]}")

        if update_frontmatter(path, updates, dry_run=args.dry_run):
            stats['files_modified'] += 1
        else:
            stats['files_skipped'] += 1

    # Summary
    print(f"\n{'DRY RUN ' if args.dry_run else ''}RESULTS:")
    print(f" Files {'would be ' if args.dry_run else ''}modified: {stats['files_modified']}")
    print(f" Files skipped: {stats['files_skipped']}")
    print(f" Titles fixed: {stats['title_fixed']}")
    print(f" Summaries fixed: {stats['summary_fixed']}")
    print(f" Audience fixed: {stats['audience_fixed']}")
    print(f" Status fixed: {stats['status_fixed']}")

    if args.dry_run:
        print("\nRun without --dry-run to apply changes.")
        print("Then run: python3 scripts/component-indexer.py (to rebuild index)")
        print("Then run: python3 scripts/generate-publish-manifest-db.py --stats (to regenerate manifest)")

# Was `if name == "main"` — the dunder underscores were stripped by the
# paste, making this a NameError at import time. Restore the standard guard.
if __name__ == "__main__":
    main()