#!/usr/bin/env python3
"""Generate YAML manifests for all research staging directories per ADR-207.

This script scans analyze-new-artifacts/ directories and generates manifests
conforming to config/schemas/research-manifest-v1.schema.json.

Author: Claude (Sonnet 4.5)
Created: 2026-02-16
"""

import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple
# Categorization from ADR-207: maps each staging directory name to the
# research category used in its generated manifest.
CATEGORIZATION = {
    # technology-evaluation (19)
    "coditect-abacus.ai-research-analyisis": "technology-evaluation",
    "coditect-agent-zero-research": "technology-evaluation",
    "CODITECT-claude-code-eval-loops": "technology-evaluation",
    "CODITECT-clawdrop-skils-research": "technology-evaluation",
    "coditect-codex.research": "technology-evaluation",
    "coditect-copilotkit-research": "technology-evaluation",
    "CODITECT-docker-registry-research": "technology-evaluation",
    "CODITECT-docusaurus-search-research": "technology-evaluation",
    "coditect-gemini-api-context-url": "technology-evaluation",
    "coditect-google-cloud-workstation-research": "technology-evaluation",
    "CODITECT-kimi-2.5-research": "technology-evaluation",
    "coditect-microsoft-fabric-research": "technology-evaluation",
    "coditect-motia-backend-research": "technology-evaluation",
    "coditect-openclaw-research": "technology-evaluation",
    "CODITECT-OPENCODE-RESEARCH": "technology-evaluation",
    "coditect-paperbanana-research": "technology-evaluation",
    "coditect-plugins-research": "technology-evaluation",
    "CODITECT-unified-studio": "technology-evaluation",
    "coditect-video-content-extraction-research": "technology-evaluation",

    # academic (13)
    "coditect-agentic-paradigms-for-llm‑enabled-healthcare-communication": "academic",
    "coditect-ai-applicability-to-occupations": "academic",
    "coditect-ai-guardrails-research": "academic",
    "coditect-ai-research-pdfs": "academic",
    "coditect-ambiguity-and-intent-research": "academic",
    "coditect-anthropic-constitution": "academic",
    "CODITECT-parallel‑agent-reinforcement-learning-research": "academic",
    "coditect-prompt-repetition-research": "academic",
    "coditect-recursive-large-language-models": "academic",
    "coditect-scaling-agent-systems": "academic",
    "coditect-society-of-mind-minsky": "academic",
    "coditect-universal-document-object-model": "academic",
    "CODITECT-yann-lecun-research": "academic",

    # competitive-intelligence (16)
    "coditect-a16z-youtube-analysis-research": "competitive-intelligence",
    "coditect-agentiic-enterprise-systems": "competitive-intelligence",
    "CODITECT-AI-HOUSE-DAVOS-TRANSCRIPTS": "competitive-intelligence",
    "coditect-all-in-podcast-transcripts": "competitive-intelligence",
    "coditect-anthropic-cowork-impact-competitive-impact": "competitive-intelligence",
    "coditect-davos-economic-forum-2026": "competitive-intelligence",
    "CODITECT-dylan-davis-research": "competitive-intelligence",
    "coditect-eric-schmidt-ai": "competitive-intelligence",
    "coditect-ibm-ai-research": "competitive-intelligence",
    "coditect-lex-clips-podcast-transcripts": "competitive-intelligence",
    "coditect-mckinsey-ai-hype-cycle-research": "competitive-intelligence",
    "coditect-mckinsey-research": "competitive-intelligence",
    "CODITECT-moltbot-research": "competitive-intelligence",
    "coditect-nate-b-jones-youtube-transcript-analysis": "competitive-intelligence",
    "CODITECT-palantir-research": "competitive-intelligence",
    "coditect-third-golden-age-of-software": "competitive-intelligence",

    # business-market (14)
    "CODITECT-Avivatec-Project": "business-market",
    "coditect-avivatect-fp-and-a-research": "business-market",
    "CODITECT-az1-entity-analysis": "business-market",
    "coditect-canada-entrepreneurs": "business-market",
    "coditect-financial-model-2026-02-04": "business-market",
    "CODITECT-founders-handbook": "business-market",
    "CODITECT-hi-tech-ventures-research": "business-market",
    "coditect-legal-contracts": "business-market",
    "coditect-product-market-fit": "business-market",
    "coditect-roblox-market-research": "business-market",
    "CODITECT-RUNWAY-mentor-deck-research": "business-market",
    "coditect-tiny-seed-playbook": "business-market",
    "coditect-use-cases": "business-market",
    "coditect-value-proposition": "business-market",

    # domain (6)
    "coditect-ai-risk-management-framework": "domain",
    "coditect-bioscience-workorders-research": "domain",
    "coditect-c3pao-accreditation-research": "domain",
    "CODITECT-LIMS-sample-receiving-process": "domain",
    "coditect-regulatory-frameworks-research": "domain",
    "coditect-zero-trust-security-agentic-ai-research": "domain",

    # process-internal (28)
    "CODITECT-ambiguity-and-intent-research-DUPLICATE": "process-internal",
    "CODITECT-analysis-workspace": "process-internal",
    "CODITECT-analyze-ui-images": "process-internal",
    "coditect-bookmarks-buku": "process-internal",
    "coditect-browser-analysis": "process-internal",
    "coditect-browser-screenshots": "process-internal",
    "CODITECT-consequence-aware-automous-execution": "process-internal",
    "coditect-conflict-avoidance-strategies": "process-internal",
    "coditect-context-graph-research": "process-internal",
    "coditect-continual-learning-analysis": "process-internal",
    "coditect-core-process-framework": "process-internal",
    "coditect-decision-rights-and-responsibility-frameworks": "process-internal",
    "coditect-internal-meeting-note-analysis": "process-internal",
    "coditect-license-management": "process-internal",
    "coditect-master-prompt-research": "process-internal",
    "coditect-method-for-analysis-large-file-sets": "process-internal",
    "CODITECT-moe-judges-research": "process-internal",
    "coditect-new-installation-errors-troubleshoot": "process-internal",
    "coditect-new-project-ai-initiation-process": "process-internal",
    "coditect-prd-user-intention-research": "process-internal",
    "coditect-profile-images": "process-internal",
    "coditect-ralph-wiggum-technique": "process-internal",
    "CODITECT-screenshots-for-analysis": "process-internal",
    "coditect-second-brain-research": "process-internal",
    "CODITECT-SKILLS-cross-functional-llm-research": "process-internal",
    "coditect-system-prompt": "process-internal",
    "coditect-udom-research-autonomous-orchestration": "process-internal",
    "coditect-ui-ux-agent-design-research": "process-internal",
}

def strip_prefix(dirname: str) -> str:
    """Strip the coditect-/CODITECT- prefix and lowercase for use as a topic.

    Args:
        dirname: Staging directory name, e.g. "CODITECT-foo-research".

    Returns:
        Lowercased topic with the prefix and any surrounding hyphens removed.
    """
    topic = dirname
    if topic.lower().startswith("coditect-"):
        topic = topic[9:]
    elif topic.lower().startswith("coditect"):
        # Prefix without the trailing hyphen (e.g. "coditectfoo").
        topic = topic[8:]
    return topic.lower().strip("-")

def detect_input_type(dir_path: Path) -> str:
    """Detect the input source type from top-level file extensions.

    Args:
        dir_path: Staging directory to inspect.

    Returns:
        One of "pdf", "transcript", "url", "git-repository", "multi-source".
    """
    # Collect recognized extensions from non-hidden, top-level files only.
    extensions = set()
    for item in dir_path.iterdir():
        if item.is_file() and not item.name.startswith("."):
            ext = item.suffix.lower()
            if ext in (".pdf", ".txt", ".md", ".docx", ".mp4", ".json"):
                extensions.add(ext)

    # A single uniform extension maps to a specific source type.
    if ".pdf" in extensions and len(extensions) == 1:
        return "pdf"
    elif ".txt" in extensions and len(extensions) == 1:
        return "transcript"
    elif ".md" in extensions and len(extensions) == 1:
        return "url"
    elif ".git" in [f.name for f in dir_path.iterdir() if f.is_dir()]:
        return "git-repository"
    else:
        return "multi-source"

def detect_artifacts(dir_path: Path) -> List[Dict]:
    """Detect artifacts in the staging directory.

    Args:
        dir_path: Staging directory to inspect.

    Returns:
        List of artifact dicts ({"type", "staging_path", "status"}), with a
        "raw-input" entry always first.
    """
    artifacts = []

    # Check for pipeline artifacts directory
    artifacts_dir = dir_path / "artifacts"
    if artifacts_dir.exists() and artifacts_dir.is_dir():
        # Known pipeline-generated artifact files mapped to manifest types.
        artifact_types = {
            "executive-summary.md": "executive-summary",
            "assessment.md": "assessment",
            "sdd.md": "sdd",
            "tdd.md": "tdd",
            "c4-architecture.md": "c4-diagram",
            "glossary.md": "glossary",
            "1-2-3-detailed-quick-start.md": "quick-start",
            "mermaid-diagrams.md": "mermaid-diagrams",
            "coditect-impact.md": "coditect-impact",
        }

        for filename, artifact_type in artifact_types.items():
            artifact_path = artifacts_dir / filename
            if artifact_path.exists():
                artifacts.append({
                    "type": artifact_type,
                    # FIX: record the actual file path (was the literal
                    # placeholder "artifacts/(unknown)").
                    "staging_path": f"artifacts/{filename}",
                    "status": "staging"
                })

        # Check for ADRs
        adr_dir = artifacts_dir / "adrs"
        if adr_dir.exists():
            adr_count = len(list(adr_dir.glob("ADR-*.md")))
            if adr_count > 0:
                artifacts.append({
                    "type": "adr",
                    "staging_path": f"artifacts/adrs/ ({adr_count} files)",
                    "status": "staging"
                })

    # Always add raw-input artifact as the first entry.
    artifacts.insert(0, {
        "type": "raw-input",
        "staging_path": ".",
        "status": "staging"
    })

    # Check for promoted artifacts in internal/analysis/
    # This would require checking the promoted location, skipping for now.
    # (The fallback below is defensive; insert() above guarantees non-empty.)
    return artifacts if artifacts else [{"type": "raw-input", "staging_path": ".", "status": "staging"}]

def get_recommendation(dir_path: Path, category: str) -> Tuple[str, str]:
    """Determine recommendation and confidence based on artifacts.

    Args:
        dir_path: Staging directory to inspect.
        category: Assigned category (currently unused; kept for callers).

    Returns:
        (recommendation, confidence) tuple, e.g. ("MONITOR", "LOW").
    """
    artifacts_dir = dir_path / "artifacts"

    # Pipeline outputs (summary/assessment) justify higher confidence.
    if artifacts_dir.exists() and artifacts_dir.is_dir():
        has_exec_summary = (artifacts_dir / "executive-summary.md").exists()
        has_assessment = (artifacts_dir / "assessment.md").exists()

        if has_exec_summary or has_assessment:
            return ("MONITOR", "MEDIUM")

    # Default for raw inputs
    return ("MONITOR", "LOW")

def estimate_date(dirname: str) -> str:
    """Estimate the research date for a directory. Defaults to 2026-01-15.

    Args:
        dirname: Staging directory name (currently unused).

    Returns:
        ISO date string used across this batch of manifests.
    """
    # Could parse dates from directory names or file mtimes; for now,
    # use the batch default.
    return "2026-01-15"

def generate_yaml(dirname: str, category: str, dir_path: Path) -> str:
    """Generate the YAML manifest text for one research directory.

    Args:
        dirname: Staging directory name (with coditect-/CODITECT- prefix).
        category: Category from CATEGORIZATION.
        dir_path: Path to the staging directory on disk.

    Returns:
        Manifest content as a string, ready to write to a .yaml file.
    """
    topic = strip_prefix(dirname)
    date_conducted = estimate_date(dirname)
    # Use the estimated date so research_id stays consistent with the
    # manifest filename written by main().
    research_id = f"{date_conducted}-{topic}"
    input_type = detect_input_type(dir_path)
    artifacts = detect_artifacts(dir_path)
    recommendation, confidence = get_recommendation(dir_path, category)

    # Build YAML manually (no pyyaml dependency). Children are indented
    # two spaces per level so the output is valid block-style YAML.
    yaml_lines = [
        "# Research Manifest (ADR-207)",
        f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "metadata:",
        f'  research_id: "{research_id}"',
        f'  topic: "{dirname}"',
        f"  category: {category}",
        f"  date_conducted: {date_conducted}",
    ]

    # Add pipeline_version only when pipeline artifacts exist.
    artifacts_dir = dir_path / "artifacts"
    if artifacts_dir.exists():
        yaml_lines.append('  pipeline_version: "1.0"')

    yaml_lines.extend([
        f"  recommendation: {recommendation}",
        f"  confidence: {confidence}",
        "",
        "input_sources:",
        f"  type: {input_type}",
        "  primary:",
        f'    url: "file://analyze-new-artifacts/{dirname}/"',
        "",
        "artifacts:",
    ])

    for artifact in artifacts:
        yaml_lines.append(f"  - type: {artifact['type']}")
        if "staging_path" in artifact:
            yaml_lines.append(f'    staging_path: "{artifact["staging_path"]}"')
        yaml_lines.append(f"    status: {artifact['status']}")

    # Tags: the category plus the first hyphen-delimited word of the topic.
    tags = [category, topic.split("-")[0] if "-" in topic else topic]
    yaml_lines.extend([
        "",
        "tags:",
    ])
    for tag in tags[:3]:  # Limit to 3 tags
        yaml_lines.append(f"  - {tag}")

    yaml_lines.append("")

    return "\n".join(yaml_lines)

def main() -> int:
    """Generate manifests for every categorized staging directory.

    Returns:
        0 if all manifests generated cleanly, 1 if any directory errored.
    """
    # Paths: this script lives one directory below the repo root.
    # FIX: was Path(file) (a NameError); the dunder is __file__.
    base_dir = Path(__file__).parent.parent
    staging_dir = base_dir / "analyze-new-artifacts"
    manifest_dir = base_dir / "internal" / "research" / "manifests"

    if not staging_dir.exists():
        print(f"ERROR: Staging directory not found: {staging_dir}")
        sys.exit(1)

    # Create manifest directory
    manifest_dir.mkdir(parents=True, exist_ok=True)

    # Generate manifests
    generated = 0
    skipped = 0
    errors = 0

    print(f"Generating manifests for {len(CATEGORIZATION)} directories...")
    print(f"Output: {manifest_dir}")
    print()

    for dirname, category in sorted(CATEGORIZATION.items()):
        dir_path = staging_dir / dirname

        if not dir_path.exists():
            print(f"SKIP: {dirname} (not found)")
            skipped += 1
            continue

        try:
            yaml_content = generate_yaml(dirname, category, dir_path)
            topic = strip_prefix(dirname)
            manifest_filename = f"2026-01-15-{topic}.yaml"
            manifest_path = manifest_dir / manifest_filename

            with open(manifest_path, "w") as f:
                f.write(yaml_content)

            print(f"OK: {manifest_filename} ({category})")
            generated += 1

        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"ERROR: {dirname} - {e}")
            errors += 1

    print()
    print("=" * 60)
    print(f"Generated: {generated}")
    print(f"Skipped: {skipped}")
    print(f"Errors: {errors}")
    print(f"Total: {len(CATEGORIZATION)}")
    print()

    # Category breakdown
    print("By Category:")
    category_counts = {}
    for category in CATEGORIZATION.values():
        category_counts[category] = category_counts.get(category, 0) + 1

    for category, count in sorted(category_counts.items()):
        print(f"  {category}: {count}")

    return 0 if errors == 0 else 1

# FIX: was `if name == "main"` — the scrape stripped the dunders; without
# them this raises NameError (and would never match) when run as a script.
if __name__ == "__main__":
    sys.exit(main())