
#!/usr/bin/env python3 """ J.20.4: Generate publish.json from platform.db + document-taxonomy.json.

Database-backed replacement for tools/web-publishing-platform/scripts/generate-publish-manifest.js. Queries platform.db (populated by component-indexer.py) for document metadata, uses document_taxonomy for category ordering/display, and generates the same publish.json format consumed by the Document Viewer (viewer.jsx).

Usage: python3 scripts/generate-publish-manifest-db.py python3 scripts/generate-publish-manifest-db.py --project BIO-QMS --output public/publish.json python3 scripts/generate-publish-manifest-db.py --include-dashboards --stats

ADR: J.20 (Document Taxonomy System) Created: 2026-02-17 """

import argparse import json import os import re import sqlite3 import sys from datetime import datetime, timezone from pathlib import Path

SCRIPT_DIR = Path(file).resolve().parent ROOT_DIR = SCRIPT_DIR.parent

# Resolve platform.db path via ADR-118 paths module; fall back to well-known
# filesystem locations when the module is unavailable.
sys.path.insert(0, str(SCRIPT_DIR / "core"))
try:
    from paths import get_context_storage_dir

    CONTEXT_STORAGE = get_context_storage_dir()
    DB_PATH = CONTEXT_STORAGE / "platform.db"
except ImportError:
    _user_data = Path.home() / "PROJECTS" / ".coditect-data" / "context-storage"
    if _user_data.exists():
        DB_PATH = _user_data / "platform.db"
    else:
        DB_PATH = ROOT_DIR / "context-storage" / "platform.db"

TAXONOMY_PATH = ROOT_DIR / "config" / "document-taxonomy.json"

def load_taxonomy(path: Path = TAXONOMY_PATH) -> dict:
    """Load document taxonomy configuration.

    Returns an empty taxonomy skeleton (and warns on stderr) when the config
    file is missing, so callers can proceed without a taxonomy.
    """
    if not path.exists():
        print(f"WARNING: Taxonomy config not found at {path}", file=sys.stderr)
        return {"categories": [], "category_resolution_map": {}, "type_to_category_map": {}}
    with open(path) as f:
        return json.load(f)

def strip_markdown(md: str) -> str:
    """Strip markdown formatting to plain text for search indexing.

    Mirrors the JS stripMarkdown() in generate-publish-manifest.js.
    """
    text = md
    # Remove YAML frontmatter FIRST (before --- gets stripped as horizontal rule)
    text = re.sub(r'^---\n[\s\S]*?\n---\n?', '', text)
    text = re.sub(r'```[\s\S]*?```', ' ', text)          # code blocks
    text = re.sub(r'`[^`]+`', ' ', text)                  # inline code
    text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text)      # images
    text = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', text)  # links → keep text
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.M)    # heading markers
    text = re.sub(r'(\*{1,3}|_{1,3})', '', text)          # bold/italic
    text = re.sub(r'^[-*+]\s+', '', text, flags=re.M)     # list markers
    text = re.sub(r'^\d+\.\s+', '', text, flags=re.M)     # numbered lists
    text = re.sub(r'^>\s+', '', text, flags=re.M)         # blockquotes
    text = re.sub(r'---+', '', text)                      # horizontal rules
    text = re.sub(r'\|', ' ', text)                       # table pipes
    text = re.sub(r'\n{2,}', '\n', text)                  # collapse blank lines
    return text.strip()

def title_from_filename(filename: str) -> str:
    """Extract a human-readable title from a filename.

    e.g. "51-business-case.md" → "Business Case".
    """
    name = Path(filename).stem
    # Remove leading number prefix (e.g., "51-business-case" → "business-case")
    name = re.sub(r'^\d+-', '', name)
    # Convert kebab-case to Title Case
    return name.replace('-', ' ').title()

def get_taxonomy_order(taxonomy: dict) -> dict:
    """Build category_slug → sort_order mapping from taxonomy config.

    Categories without an explicit sort_order default to 100 (sorts last
    among configured categories).
    """
    return {cat["slug"]: cat.get("sort_order", 100)
            for cat in taxonomy.get("categories", [])}

def get_taxonomy_display(taxonomy: dict) -> dict:
    """Build category_slug → display_name mapping from taxonomy config.

    Raises KeyError if a category entry lacks a "name" (a config error).
    """
    return {cat["slug"]: cat["name"] for cat in taxonomy.get("categories", [])}

def get_taxonomy_icons(taxonomy: dict) -> dict:
    """Build category_slug → icon mapping from taxonomy config.

    Defaults to "FileText" when a category has no icon configured.
    """
    return {cat["slug"]: cat.get("icon", "FileText")
            for cat in taxonomy.get("categories", [])}

def get_taxonomy_descriptions(taxonomy: dict) -> dict:
    """Build category_slug → description mapping from taxonomy config.

    Defaults to "" when a category has no description configured.
    """
    return {cat["slug"]: cat.get("description", "")
            for cat in taxonomy.get("categories", [])}

def query_documents(conn: sqlite3.Connection, project_id: str = "default",
                    fresh_only: bool = True) -> list:
    """Query all documents from platform.db with their frontmatter.

    Args:
        conn: SQLite connection to platform.db
        project_id: Project scope filter (rows with empty/NULL project_id
            are always included alongside the requested project)
        fresh_only: If True, only include documents that have frontmatter entries
            (i.e., indexed with taxonomy-aware component-indexer). This
            filters out stale entries from previous indexer runs.

    Returns:
        List of dicts with keys: component_id, name, category_slug,
        file_path, description, frontmatter (key → value dict).
    """
    cursor = conn.cursor()

    if fresh_only:
        # Only documents that have frontmatter entries (taxonomy-indexed)
        cursor.execute("""
            SELECT c.id, c.name, c.type, c.category, c.path, c.description
            FROM components c
            WHERE c.type = 'document'
              AND (c.project_id = ? OR c.project_id = '' OR c.project_id IS NULL)
              AND c.id IN (SELECT DISTINCT component_id FROM document_frontmatter)
            ORDER BY c.category, c.name
        """, (project_id,))
    else:
        # All documents including stale entries
        cursor.execute("""
            SELECT c.id, c.name, c.type, c.category, c.path, c.description
            FROM components c
            WHERE c.type = 'document'
              AND (c.project_id = ? OR c.project_id = '' OR c.project_id IS NULL)
            ORDER BY c.category, c.name
        """, (project_id,))

    docs = []
    # fetchall() materializes the result set, so the same cursor can safely be
    # reused for the per-document frontmatter queries below.
    for comp_id, name, comp_type, category, file_path, description in cursor.fetchall():
        # Get frontmatter key-value pairs for this document
        cursor.execute("""
            SELECT key, value FROM document_frontmatter
            WHERE component_id = ?
        """, (comp_id,))
        fm = dict(cursor.fetchall())

        docs.append({
            "component_id": comp_id,
            "name": name,
            "category_slug": category or "reference",  # uncategorized → default bucket
            "file_path": file_path or "",
            "description": description or "",
            "frontmatter": fm,
        })

    return docs

def read_body_text(file_path: str, root_dir: Path) -> str:
    """Read and strip markdown body text for search indexing.

    Returns "" for missing files, non-markdown files, or read errors —
    body text is best-effort and must never abort manifest generation.
    """
    full_path = root_dir / file_path
    if not full_path.exists() or full_path.suffix != '.md':
        return ""
    try:
        content = full_path.read_text(encoding='utf-8', errors='replace')
        return strip_markdown(content)
    except (OSError, UnicodeDecodeError):
        return ""

def build_document_entry(doc: dict, taxonomy_display: dict, root_dir: Path,
                         include_body: bool = True) -> dict:
    """Build a single publish.json document entry from DB record.

    Args:
        doc: Record from query_documents() (frontmatter, file_path, ...)
        taxonomy_display: category_slug → display name mapping
        root_dir: Project root for resolving markdown body text
        include_body: When True, read + strip markdown body for search
    """
    fm = doc["frontmatter"]
    file_path = doc["file_path"]
    category_slug = doc["category_slug"]

    # Determine display category name (fall back to Title-Cased slug)
    category_name = taxonomy_display.get(category_slug, category_slug.replace('-', ' ').title())

    # Determine document type (markdown vs dashboard)
    if file_path.endswith('.jsx'):
        doc_type = "dashboard"
    else:
        doc_type = "markdown"

    # Build document ID from path (matches JS generator format)
    doc_id = file_path.replace('/', '-').replace('\\', '-')
    for ext in ('.md', '.jsx', '.tsx'):
        dashed = ext.replace('.', '-')  # ".md" → "-md" (dot already dashed above)
        if doc_id.endswith(dashed):
            # FIX: strip the full dashed suffix; the original sliced
            # len(ext) - 1 chars, leaving one character behind.
            doc_id = doc_id[:-len(dashed)]
        elif doc_id.endswith(ext):
            doc_id = doc_id[:-len(ext)]
    # Clean: replace any remaining .md/.jsx/.tsx and stray dots in the id
    doc_id = re.sub(r'\.(md|jsx|tsx)$', '', doc_id)
    doc_id = doc_id.replace('.', '-')

    # Title: frontmatter > DB name > filename
    title = fm.get("title", "") or doc["name"] or title_from_filename(file_path)

    # Keywords: prefer JSON list; fall back to comma-separated string
    keywords_raw = fm.get("keywords", "")
    if keywords_raw:
        try:
            keywords = json.loads(keywords_raw)
            if not isinstance(keywords, list):
                keywords = [str(keywords)]
        except (json.JSONDecodeError, TypeError):
            keywords = [k.strip() for k in str(keywords_raw).split(',') if k.strip()]
    else:
        keywords = []

    # Tags (merge into keywords for search; non-JSON tags are ignored)
    tags_raw = fm.get("tags", "")
    if tags_raw:
        try:
            tags = json.loads(tags_raw)
            if isinstance(tags, list):
                keywords = list(set(keywords + [str(t) for t in tags]))
        except (json.JSONDecodeError, TypeError):
            pass

    entry = {
        "id": doc_id,
        "title": title,
        "path": file_path,
        "type": doc_type,
        "audience": fm.get("audience", "technical"),
        "category": category_name,
        "category_slug": category_slug,
        "keywords": keywords,
        "summary": fm.get("summary", "") or doc["description"] or "",
        "author": fm.get("author", ""),
        "status": fm.get("status", "active"),
        "icon": fm.get("icon", ""),
        "sort_order": int(fm.get("sort_order", "0") or "0"),
    }

    # Body text for search (markdown only, optional)
    if include_body and doc_type == "markdown":
        entry["body_text"] = read_body_text(file_path, root_dir)
    elif doc_type == "markdown":
        entry["body_text"] = ""

    return entry

def build_manifest(conn: sqlite3.Connection, taxonomy: dict,
                   project_id: str = "default", project_name: str = "",
                   root_dir: Path = ROOT_DIR, include_body: bool = True,
                   include_dashboards: bool = False,
                   fresh_only: bool = True) -> dict:
    """Build the complete publish.json manifest from database.

    Args:
        conn: Open SQLite connection to platform.db
        taxonomy: Parsed document-taxonomy.json (see load_taxonomy)
        project_id: Project scope filter for document queries
        project_name: Display name for the manifest header
        root_dir: Project root for markdown body text and dashboards/
        include_body: Embed stripped markdown body text for search
        include_dashboards: Also scan dashboards/ for JSX files not in the DB
        fresh_only: Only include taxonomy-indexed documents
    """
    taxonomy_display = get_taxonomy_display(taxonomy)
    taxonomy_order = get_taxonomy_order(taxonomy)
    taxonomy_icons = get_taxonomy_icons(taxonomy)
    taxonomy_descs = get_taxonomy_descriptions(taxonomy)

    # Query documents from DB
    raw_docs = query_documents(conn, project_id, fresh_only=fresh_only)

    # Build document entries
    documents = []
    for doc in raw_docs:
        entry = build_document_entry(doc, taxonomy_display, root_dir, include_body)
        documents.append(entry)

    # Optionally include JSX dashboards from filesystem (not in platform.db as documents)
    if include_dashboards:
        dashboards_dir = root_dir / "dashboards"
        if dashboards_dir.exists():
            for jsx_file in sorted(dashboards_dir.rglob("*.jsx")):
                rel_path = str(jsx_file.relative_to(root_dir))
                # Check if already in documents list
                if any(d["path"] == rel_path for d in documents):
                    continue
                # Resolve category from the first directory under dashboards/
                parts = rel_path.split("/")
                cat_slug = taxonomy.get("category_resolution_map", {}).get(
                    parts[1] if len(parts) > 1 else "", "analysis"
                )
                cat_name = taxonomy_display.get(cat_slug, cat_slug.replace('-', ' ').title())
                doc_id = rel_path.replace('/', '-').replace('\\', '-')
                doc_id = re.sub(r'\.jsx$', '', doc_id)

                documents.append({
                    "id": doc_id,
                    "title": title_from_filename(jsx_file.name),
                    "path": rel_path,
                    "type": "dashboard",
                    "audience": "technical",
                    "category": cat_name,
                    "category_slug": cat_slug,
                    "keywords": [],
                    "summary": "",
                    "author": "",
                    "status": "active",
                    "icon": "",
                    "sort_order": 0,
                })

    # Sort: by taxonomy sort_order, then alphabetically within category
    documents.sort(key=lambda d: (
        taxonomy_order.get(d.get("category_slug", ""), 100),
        d.get("title", "").lower()
    ))

    # Build category index with taxonomy metadata
    categories_map = {}
    for doc in documents:
        cat_slug = doc.get("category_slug", "reference")
        cat_name = doc["category"]
        if cat_slug not in categories_map:
            categories_map[cat_slug] = {
                "name": cat_name,
                "slug": cat_slug,
                "description": taxonomy_descs.get(cat_slug, ""),
                "icon": taxonomy_icons.get(cat_slug, "FileText"),
                "sort_order": taxonomy_order.get(cat_slug, 100),
                "count": 0,
                "types": {},
            }
        categories_map[cat_slug]["count"] += 1
        doc_type = doc["type"]
        categories_map[cat_slug]["types"][doc_type] = categories_map[cat_slug]["types"].get(doc_type, 0) + 1

    # Sort categories by taxonomy sort_order
    categories = sorted(categories_map.values(), key=lambda c: c["sort_order"])

    # category_slug is kept on document entries — it's useful for filtering
    # and the viewer ignores unknown fields.

    manifest = {
        "project_name": project_name or "CODITECT Web Publishing Platform",
        "project_id": project_id,
        "version": "2.0.0",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "generator": "generate-publish-manifest-db.py (J.20.4)",
        "taxonomy_version": taxonomy.get("version", "1.0.0"),
        "total_documents": len(documents),
        "categories": categories,
        "documents": documents,
    }

    return manifest

def main():
    """CLI entry point: parse args, build the manifest, write or print it."""
    parser = argparse.ArgumentParser(
        description="J.20.4: Generate publish.json from platform.db + taxonomy",
        epilog="Replaces generate-publish-manifest.js with DB-backed generation.",
    )
    parser.add_argument(
        "--output", "-o",
        default=str(ROOT_DIR / "tools" / "web-publishing-platform" / "public" / "publish.json"),
        help="Output path for publish.json (default: tools/web-publishing-platform/public/publish.json)",
    )
    parser.add_argument(
        "--project", "-p", default="default",
        help="Project ID to filter documents (default: 'default')",
    )
    parser.add_argument(
        "--project-name", default="",
        help="Project display name for manifest header",
    )
    parser.add_argument(
        "--root", "-r", default=str(ROOT_DIR),
        help="Project root directory for reading markdown body text",
    )
    parser.add_argument(
        "--db", default=str(DB_PATH),
        help=f"Path to platform.db (default: {DB_PATH})",
    )
    parser.add_argument(
        "--taxonomy", default=str(TAXONOMY_PATH),
        help=f"Path to document-taxonomy.json (default: {TAXONOMY_PATH})",
    )
    parser.add_argument(
        "--include-dashboards", action="store_true",
        help="Include JSX dashboards from dashboards/ directory",
    )
    parser.add_argument(
        "--no-body", action="store_true",
        help="Skip reading markdown body text (faster, smaller output)",
    )
    parser.add_argument(
        "--stats", action="store_true",
        help="Print category statistics after generation",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Print manifest to stdout instead of writing to file",
    )
    parser.add_argument(
        "--compact", action="store_true",
        help="Write compact JSON (no indentation) for smaller file size",
    )
    parser.add_argument(
        "--all-documents", action="store_true",
        help="Include all documents (including stale entries without taxonomy frontmatter)",
    )

    args = parser.parse_args()

    # Validate database exists
    db_path = Path(args.db)
    if not db_path.exists():
        print(f"ERROR: platform.db not found at {db_path}", file=sys.stderr)
        print("Run: python3 scripts/component-indexer.py", file=sys.stderr)
        sys.exit(1)

    # Load taxonomy
    taxonomy = load_taxonomy(Path(args.taxonomy))

    # Connect to database
    conn = sqlite3.connect(str(db_path))

    # Verify document_frontmatter table exists
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='document_frontmatter'")
    if not cursor.fetchone():
        print("ERROR: document_frontmatter table not found in platform.db", file=sys.stderr)
        print("Run: python3 scripts/component-indexer.py (to rebuild with taxonomy support)", file=sys.stderr)
        conn.close()
        sys.exit(1)

    # Build manifest
    root_dir = Path(args.root)
    manifest = build_manifest(
        conn=conn,
        taxonomy=taxonomy,
        project_id=args.project,
        project_name=args.project_name,
        root_dir=root_dir,
        include_body=not args.no_body,
        include_dashboards=args.include_dashboards,
        fresh_only=not args.all_documents,
    )

    conn.close()

    # Output
    indent = None if args.compact else 2
    json_output = json.dumps(manifest, indent=indent, ensure_ascii=False)

    if args.dry_run:
        print(json_output)
    else:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json_output, encoding='utf-8')
        print(f"Generated {output_path}: {manifest['total_documents']} documents "
              f"across {len(manifest['categories'])} categories")

    # Stats
    # NOTE(review): this prints whenever we are NOT in --dry-run, even without
    # --stats — looks intentional as a post-write summary, but confirm the
    # `or` (vs `and`) is deliberate.
    if args.stats or not args.dry_run:
        print(f"\nCategory breakdown:")
        for cat in manifest["categories"]:
            types_str = ", ".join(f"{c} {t}" for t, c in cat["types"].items())
            print(f" [{cat['sort_order']:2d}] {cat['name']:20s} {cat['count']:4d} ({types_str})")
        print(f"\n Total: {manifest['total_documents']} documents")

        # Size info
        size_kb = len(json_output.encode('utf-8')) / 1024
        print(f" Output size: {size_kb:.1f} KB")

if name == "main": main()