
#!/usr/bin/env python3 """ J.20.4: Generate publish.json from platform.db + document-taxonomy.json.

Database-backed replacement for tools/web-publishing-platform/scripts/generate-publish-manifest.js. Queries platform.db (populated by component-indexer.py) for document metadata, uses document_taxonomy for category ordering/display, and generates the same publish.json format consumed by the Document Viewer (viewer.jsx).

Usage: python3 scripts/generate-publish-manifest-db.py python3 scripts/generate-publish-manifest-db.py --project BIO-QMS --output public/publish.json python3 scripts/generate-publish-manifest-db.py --include-dashboards --stats

ADR: J.20 (Document Taxonomy System) Created: 2026-02-17 """

import argparse import json import os import re import sqlite3 import sys from datetime import datetime, timezone from pathlib import Path

SCRIPT_DIR = Path(file).resolve().parent ROOT_DIR = SCRIPT_DIR.parent

# Resolve platform.db path via ADR-118 paths module; fall back to well-known
# filesystem locations when the module is unavailable.
sys.path.insert(0, str(SCRIPT_DIR / "core"))
try:
    from paths import get_context_storage_dir

    CONTEXT_STORAGE = get_context_storage_dir()
    DB_PATH = CONTEXT_STORAGE / "platform.db"
except ImportError:
    _user_data = Path.home() / "PROJECTS" / ".coditect-data" / "context-storage"
    if _user_data.exists():
        DB_PATH = _user_data / "platform.db"
    else:
        DB_PATH = ROOT_DIR / "context-storage" / "platform.db"

TAXONOMY_PATH = ROOT_DIR / "config" / "document-taxonomy.json"

def load_taxonomy(path: Path = TAXONOMY_PATH) -> dict:
    """Load document taxonomy configuration.

    Returns an empty taxonomy skeleton (and warns on stderr) when the config
    file is missing, so callers can proceed without a taxonomy.
    """
    if not path.exists():
        print(f"WARNING: Taxonomy config not found at {path}", file=sys.stderr)
        return {"categories": [], "category_resolution_map": {}, "type_to_category_map": {}}
    with open(path) as f:
        return json.load(f)

def strip_markdown(md: str) -> str:
    """Strip markdown formatting to plain text for search indexing.

    Mirrors the JS stripMarkdown() in generate-publish-manifest.js.
    """
    text = md
    # Remove YAML frontmatter FIRST (before --- gets stripped as horizontal rule)
    text = re.sub(r'^---\n[\s\S]*?\n---\n?', '', text)
    text = re.sub(r'```[\s\S]*?```', ' ', text)          # code blocks
    text = re.sub(r'`[^`]+`', ' ', text)                  # inline code
    text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text)      # images
    text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)  # links → keep text
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.M)    # heading markers
    text = re.sub(r'(\*{1,3}|_{1,3})', '', text)          # bold/italic
    text = re.sub(r'^[-*+]\s+', '', text, flags=re.M)     # list markers
    text = re.sub(r'^\d+\.\s+', '', text, flags=re.M)     # numbered lists
    text = re.sub(r'^>\s+', '', text, flags=re.M)         # blockquotes
    text = re.sub(r'---+', '', text)                      # horizontal rules
    text = re.sub(r'\|', ' ', text)                       # table pipes
    text = re.sub(r'\n{2,}', '\n', text)                  # collapse blank lines
    return text.strip()

def title_from_filename(filename: str) -> str:
    """Extract a human-readable title from a filename.

    e.g. "51-business-case.md" → "Business Case".
    """
    name = Path(filename).stem
    # Remove leading number prefix (e.g., "51-business-case" → "business-case")
    name = re.sub(r'^\d+-', '', name)
    # Convert kebab-case to Title Case
    return name.replace('-', ' ').title()

def get_taxonomy_order(taxonomy: dict) -> dict:
    """Build category_slug → sort_order mapping from taxonomy config.

    Categories without an explicit sort_order default to 100 (sorts last
    among configured categories).
    """
    return {cat["slug"]: cat.get("sort_order", 100)
            for cat in taxonomy.get("categories", [])}

def get_taxonomy_display(taxonomy: dict) -> dict:
    """Build category_slug → display_name mapping from taxonomy config.

    Raises KeyError if a category entry lacks a "name" (a config error).
    """
    return {cat["slug"]: cat["name"] for cat in taxonomy.get("categories", [])}

def get_taxonomy_icons(taxonomy: dict) -> dict:
    """Build category_slug → icon mapping from taxonomy config.

    Defaults to "FileText" when a category has no icon configured.
    """
    return {cat["slug"]: cat.get("icon", "FileText")
            for cat in taxonomy.get("categories", [])}

def get_taxonomy_descriptions(taxonomy: dict) -> dict:
    """Build category_slug → description mapping from taxonomy config.

    Defaults to "" when a category has no description configured.
    """
    return {cat["slug"]: cat.get("description", "")
            for cat in taxonomy.get("categories", [])}

def query_documents(conn: sqlite3.Connection, project_id: str = "default",
                    fresh_only: bool = True) -> list:
    """Query all documents from platform.db with their frontmatter.

    Args:
        conn: SQLite connection to platform.db
        project_id: Project scope filter (rows with empty/NULL project_id
            are always included alongside the requested project)
        fresh_only: If True, only include documents that have frontmatter entries
            (i.e., indexed with taxonomy-aware component-indexer). This
            filters out stale entries from previous indexer runs.

    Returns:
        List of dicts with keys: component_id, name, category_slug,
        file_path, description, frontmatter (key → value dict).
    """
    cursor = conn.cursor()

    if fresh_only:
        # Only documents that have frontmatter entries (taxonomy-indexed)
        cursor.execute("""
            SELECT c.id, c.name, c.type, c.category, c.path, c.description
            FROM components c
            WHERE c.type = 'document'
              AND (c.project_id = ? OR c.project_id = '' OR c.project_id IS NULL)
              AND c.id IN (SELECT DISTINCT component_id FROM document_frontmatter)
            ORDER BY c.category, c.name
        """, (project_id,))
    else:
        # All documents including stale entries
        cursor.execute("""
            SELECT c.id, c.name, c.type, c.category, c.path, c.description
            FROM components c
            WHERE c.type = 'document'
              AND (c.project_id = ? OR c.project_id = '' OR c.project_id IS NULL)
            ORDER BY c.category, c.name
        """, (project_id,))

    docs = []
    # fetchall() materializes the result set, so the same cursor can safely be
    # reused for the per-document frontmatter queries below.
    for comp_id, name, comp_type, category, file_path, description in cursor.fetchall():
        # Get frontmatter key-value pairs for this document
        cursor.execute("""
            SELECT key, value FROM document_frontmatter
            WHERE component_id = ?
        """, (comp_id,))
        fm = dict(cursor.fetchall())

        docs.append({
            "component_id": comp_id,
            "name": name,
            "category_slug": category or "reference",  # uncategorized → default bucket
            "file_path": file_path or "",
            "description": description or "",
            "frontmatter": fm,
        })

    return docs

def read_body_text(file_path: str, root_dir: Path) -> str:
    """Read and strip markdown body text for search indexing.

    Returns "" for missing files, non-markdown files, or read errors —
    body text is best-effort and must never abort manifest generation.
    """
    full_path = root_dir / file_path
    if not full_path.exists() or full_path.suffix != '.md':
        return ""
    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
        return strip_markdown(content)
    except (OSError, UnicodeDecodeError):
        return ""

def build_document_entry(doc: dict, taxonomy_display: dict, root_dir: Path,
                         include_body: bool = True) -> dict:
    """Build a single publish.json document entry from DB record.

    Args:
        doc: Record from query_documents() (frontmatter, file_path, ...)
        taxonomy_display: category_slug → display name mapping
        root_dir: Project root for resolving markdown body text
        include_body: When True, read + strip markdown body for search
    """
    fm = doc["frontmatter"]
    file_path = doc["file_path"]
    category_slug = doc["category_slug"]

    # Determine display category name (fall back to Title-Cased slug)
    category_name = taxonomy_display.get(category_slug, category_slug.replace('-', ' ').title())

    # Determine document type (markdown vs dashboard)
    if file_path.endswith('.jsx'):
        doc_type = "dashboard"
    else:
        doc_type = "markdown"

    # Build document ID from path (matches JS generator format)
    doc_id = file_path.replace('/', '-').replace('\\', '-')
    for ext in ('.md', '.jsx', '.tsx'):
        dashed = ext.replace('.', '-')  # ".md" → "-md" (dot already dashed above)
        if doc_id.endswith(dashed):
            # FIX: strip the full dashed suffix; the original sliced
            # len(ext) - 1 chars, leaving one character behind.
            doc_id = doc_id[:-len(dashed)]
        elif doc_id.endswith(ext):
            doc_id = doc_id[:-len(ext)]
    # Clean: replace any remaining .md/.jsx/.tsx and stray dots in the id
    doc_id = re.sub(r'\.(md|jsx|tsx)$', '', doc_id)
    doc_id = doc_id.replace('.', '-')

    # Title: frontmatter > DB name > filename
    title = fm.get("title", "") or doc["name"] or title_from_filename(file_path)

    # Keywords: prefer JSON list; fall back to comma-separated string
    keywords_raw = fm.get("keywords", "")
    if keywords_raw:
        try:
            keywords = json.loads(keywords_raw)
            if not isinstance(keywords, list):
                keywords = [str(keywords)]
        except (json.JSONDecodeError, TypeError):
            keywords = [k.strip() for k in str(keywords_raw).split(',') if k.strip()]
    else:
        keywords = []

    # Tags (merge into keywords for search; non-JSON tags are ignored)
    tags_raw = fm.get("tags", "")
    if tags_raw:
        try:
            tags = json.loads(tags_raw)
            if isinstance(tags, list):
                keywords = list(set(keywords + [str(t) for t in tags]))
        except (json.JSONDecodeError, TypeError):
            pass

    entry = {
        "id": doc_id,
        "title": title,
        "path": file_path,
        "type": doc_type,
        "audience": fm.get("audience", "technical"),
        "category": category_name,
        "category_slug": category_slug,
        "keywords": keywords,
        "summary": fm.get("summary", "") or doc["description"] or "",
        "author": fm.get("author", ""),
        "status": fm.get("status", "active"),
        "icon": fm.get("icon", ""),
        "sort_order": int(fm.get("sort_order", "0") or "0"),
    }

    # Body text for search (markdown only, optional)
    if include_body and doc_type == "markdown":
        entry["body_text"] = read_body_text(file_path, root_dir)
    elif doc_type == "markdown":
        entry["body_text"] = ""

    return entry

def build_manifest(conn: sqlite3.Connection, taxonomy: dict,
                   project_id: str = "default", project_name: str = "",
                   root_dir: Path = ROOT_DIR, include_body: bool = True,
                   include_dashboards: bool = False,
                   fresh_only: bool = True) -> dict:
    """Build the complete publish.json manifest from database.

    Args:
        conn: Open SQLite connection to platform.db
        taxonomy: Parsed document-taxonomy.json (see load_taxonomy)
        project_id: Project scope filter for document queries
        project_name: Display name for the manifest header
        root_dir: Project root for markdown body text and dashboards/
        include_body: Embed stripped markdown body text for search
        include_dashboards: Also scan dashboards/ for JSX files not in the DB
        fresh_only: Only include taxonomy-indexed documents
    """
    taxonomy_display = get_taxonomy_display(taxonomy)
    taxonomy_order = get_taxonomy_order(taxonomy)
    taxonomy_icons = get_taxonomy_icons(taxonomy)
    taxonomy_descs = get_taxonomy_descriptions(taxonomy)

    # Query documents from DB
    raw_docs = query_documents(conn, project_id, fresh_only=fresh_only)

    # Build document entries
    documents = []
    for doc in raw_docs:
        entry = build_document_entry(doc, taxonomy_display, root_dir, include_body)
        documents.append(entry)

    # Optionally include JSX dashboards from filesystem (not in platform.db as documents)
    if include_dashboards:
        dashboards_dir = root_dir / "dashboards"
        if dashboards_dir.exists():
            for jsx_file in sorted(dashboards_dir.rglob("*.jsx")):
                rel_path = str(jsx_file.relative_to(root_dir))
                # Check if already in documents list
                if any(d["path"] == rel_path for d in documents):
                    continue
                # Resolve category from the first directory under dashboards/
                parts = rel_path.split("/")
                cat_slug = taxonomy.get("category_resolution_map", {}).get(
                    parts[1] if len(parts) > 1 else "", "analysis"
                )
                cat_name = taxonomy_display.get(cat_slug, cat_slug.replace('-', ' ').title())
                doc_id = rel_path.replace('/', '-').replace('\\', '-')
                doc_id = re.sub(r'\.jsx$', '', doc_id)

                documents.append({
                    "id": doc_id,
                    "title": title_from_filename(jsx_file.name),
                    "path": rel_path,
                    "type": "dashboard",
                    "audience": "technical",
                    "category": cat_name,
                    "category_slug": cat_slug,
                    "keywords": [],
                    "summary": "",
                    "author": "",
                    "status": "active",
                    "icon": "",
                    "sort_order": 0,
                })

    # Sort: by taxonomy sort_order, then alphabetically within category
    documents.sort(key=lambda d: (
        taxonomy_order.get(d.get("category_slug", ""), 100),
        d.get("title", "").lower()
    ))

    # Build category index with taxonomy metadata
    categories_map = {}
    for doc in documents:
        cat_slug = doc.get("category_slug", "reference")
        cat_name = doc["category"]
        if cat_slug not in categories_map:
            categories_map[cat_slug] = {
                "name": cat_name,
                "slug": cat_slug,
                "description": taxonomy_descs.get(cat_slug, ""),
                "icon": taxonomy_icons.get(cat_slug, "FileText"),
                "sort_order": taxonomy_order.get(cat_slug, 100),
                "count": 0,
                "types": {},
            }
        categories_map[cat_slug]["count"] += 1
        doc_type = doc["type"]
        categories_map[cat_slug]["types"][doc_type] = categories_map[cat_slug]["types"].get(doc_type, 0) + 1

    # Sort categories by taxonomy sort_order
    categories = sorted(categories_map.values(), key=lambda c: c["sort_order"])

    # category_slug is kept on document entries — it's useful for filtering
    # and the viewer ignores unknown fields.

    manifest = {
        "project_name": project_name or "CODITECT Web Publishing Platform",
        "project_id": project_id,
        "version": "2.0.0",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "generator": "generate-publish-manifest-db.py (J.20.4)",
        "taxonomy_version": taxonomy.get("version", "1.0.0"),
        "total_documents": len(documents),
        "categories": categories,
        "documents": documents,
    }

    return manifest

def main():
    """CLI entry point: parse args, build the manifest, write or print it."""
    parser = argparse.ArgumentParser(
        description="J.20.4: Generate publish.json from platform.db + taxonomy",
        epilog="Replaces generate-publish-manifest.js with DB-backed generation.",
    )
    parser.add_argument(
        "--output", "-o",
        default=str(ROOT_DIR / "tools" / "web-publishing-platform" / "public" / "publish.json"),
        help="Output path for publish.json (default: tools/web-publishing-platform/public/publish.json)",
    )
    parser.add_argument(
        "--project", "-p", default="default",
        help="Project ID to filter documents (default: 'default')",
    )
    parser.add_argument(
        "--project-name", default="",
        help="Project display name for manifest header",
    )
    parser.add_argument(
        "--root", "-r", default=str(ROOT_DIR),
        help="Project root directory for reading markdown body text",
    )
    parser.add_argument(
        "--db", default=str(DB_PATH),
        help=f"Path to platform.db (default: {DB_PATH})",
    )
    parser.add_argument(
        "--taxonomy", default=str(TAXONOMY_PATH),
        help=f"Path to document-taxonomy.json (default: {TAXONOMY_PATH})",
    )
    parser.add_argument(
        "--include-dashboards", action="store_true",
        help="Include JSX dashboards from dashboards/ directory",
    )
    parser.add_argument(
        "--no-body", action="store_true",
        help="Skip reading markdown body text (faster, smaller output)",
    )
    parser.add_argument(
        "--stats", action="store_true",
        help="Print category statistics after generation",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Print manifest to stdout instead of writing to file",
    )
    parser.add_argument(
        "--compact", action="store_true",
        help="Write compact JSON (no indentation) for smaller file size",
    )
    parser.add_argument(
        "--all-documents", action="store_true",
        help="Include all documents (including stale entries without taxonomy frontmatter)",
    )

    args = parser.parse_args()

    # Validate database exists
    db_path = Path(args.db)
    if not db_path.exists():
        print(f"ERROR: platform.db not found at {db_path}", file=sys.stderr)
        print("Run: python3 scripts/component-indexer.py", file=sys.stderr)
        sys.exit(1)

    # Load taxonomy
    taxonomy = load_taxonomy(Path(args.taxonomy))

    # Connect to database
    conn = sqlite3.connect(str(db_path))

    # Verify document_frontmatter table exists
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='document_frontmatter'")
    if not cursor.fetchone():
        print("ERROR: document_frontmatter table not found in platform.db", file=sys.stderr)
        print("Run: python3 scripts/component-indexer.py (to rebuild with taxonomy support)", file=sys.stderr)
        conn.close()
        sys.exit(1)

    # Build manifest
    root_dir = Path(args.root)
    manifest = build_manifest(
        conn=conn,
        taxonomy=taxonomy,
        project_id=args.project,
        project_name=args.project_name,
        root_dir=root_dir,
        include_body=not args.no_body,
        include_dashboards=args.include_dashboards,
        fresh_only=not args.all_documents,
    )

    conn.close()

    # Output
    indent = None if args.compact else 2
    json_output = json.dumps(manifest, indent=indent, ensure_ascii=False)

    if args.dry_run:
        print(json_output)
    else:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json_output, encoding='utf-8')
        print(f"Generated {output_path}: {manifest['total_documents']} documents "
              f"across {len(manifest['categories'])} categories")

    # Stats
    # NOTE(review): this prints whenever we are NOT in --dry-run, even without
    # --stats — looks intentional as a post-write summary, but confirm the
    # `or` (vs `and`) is deliberate.
    if args.stats or not args.dry_run:
        print(f"\nCategory breakdown:")
        for cat in manifest["categories"]:
            types_str = ", ".join(f"{c} {t}" for t, c in cat["types"].items())
            print(f" [{cat['sort_order']:2d}] {cat['name']:20s} {cat['count']:4d} ({types_str})")
        print(f"\n Total: {manifest['total_documents']} documents")

        # Size info
        size_kb = len(json_output.encode('utf-8')) / 1024
        print(f" Output size: {size_kb:.1f} KB")

if name == "main": main()