#!/usr/bin/env python3
"""
Documentation Link Analyzer

Analyzes all markdown files for broken links, orphaned documents, and
missing navigation files (README.md / CLAUDE.md).
"""
import os
import re
from pathlib import Path
from typing import Set, Dict, List, Tuple
import json

# Base directory of the repository being analyzed; docs/ and internal/
# are the two trees that are scanned.
BASE_DIR = Path("/path/to/user/PROJECTS/coditect-rollout-master/submodules/core/coditect-core")
DOCS_DIR = BASE_DIR / "docs"
INTERNAL_DIR = BASE_DIR / "internal"
class DocumentationAnalyzer:
    """Analyze the docs/ and internal/ markdown trees for broken links,
    orphaned documents, and directories missing navigation files
    (README.md / CLAUDE.md)."""

    def __init__(self) -> None:
        # BUG FIX: original defined `def init`, so this never ran as the
        # constructor and no attributes were initialized.
        # All markdown files discovered under DOCS_DIR and INTERNAL_DIR.
        self.all_md_files: Set[Path] = set()
        # Resolved paths that are the target of at least one valid link.
        self.linked_files: Set[Path] = set()
        # (source file, raw link text, resolved-but-missing path) triples.
        self.broken_links: List[Tuple[Path, str, str]] = []
        # Files not linked from anywhere (README/CLAUDE excluded).
        self.orphaned_files: Set[Path] = set()
        # Directories with markdown files but no README.md.
        self.missing_readmes: List[Path] = []
        # Major directories with no CLAUDE.md.
        self.missing_claude_mds: List[Path] = []

    def find_all_markdown_files(self) -> Set[Path]:
        """Find all markdown files in docs/ and internal/."""
        md_files = set()
        for directory in [DOCS_DIR, INTERNAL_DIR]:
            if directory.exists():
                for md_file in directory.rglob("*.md"):
                    md_files.add(md_file)
        return md_files

    def extract_markdown_links(self, file_path: Path) -> List[str]:
        """Extract all markdown link targets from a file.

        Returns an empty list (and prints a warning) if the file
        cannot be read or decoded.
        """
        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {file_path}: {e}")
            return []
        # Match both [text](link) and [text]: link formats
        inline_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
        reference_pattern = r'\[([^\]]+)\]:\s*(.+)'
        links = []
        for match in re.finditer(inline_pattern, content):
            links.append(match.group(2))
        for match in re.finditer(reference_pattern, content):
            links.append(match.group(2))
        return links

    def resolve_link(self, source_file: Path, link: str) -> Tuple[bool, Path]:
        """
        Resolve a link relative to the source file.

        Returns (exists, resolved_path). resolved_path is None for
        external links and pure-anchor references, otherwise the
        fully resolved target path (whether or not it exists).
        """
        # Skip external links
        if link.startswith(('http://', 'https://', 'mailto:', '#')):
            return (True, None)  # External links assumed valid
        # Remove anchors
        link = link.split('#')[0]
        if not link:
            return (True, None)  # Just an anchor reference
        # Resolve relative to the source file's directory; leading '/'
        # means "from repo root".
        source_dir = source_file.parent
        if link.startswith('/'):
            target_path = (BASE_DIR / link[1:]).resolve()
        else:
            target_path = (source_dir / link).resolve()
        # BUG FIX: the original returned None for the path when the target
        # existed, which left linked_files empty in analyze() and made
        # every document look orphaned. Always return the resolved path.
        return (target_path.exists(), target_path)

    def check_directory_navigation(self, directory: Path) -> Tuple[bool, bool]:
        """
        Check if directory has README.md and CLAUDE.md.

        Returns (has_readme, has_claude_md).
        """
        has_readme = (directory / "README.md").exists()
        has_claude = (directory / "CLAUDE.md").exists()
        return (has_readme, has_claude)

    def analyze(self) -> None:
        """Run the complete analysis, populating all result attributes."""
        print("Finding all markdown files...")
        self.all_md_files = self.find_all_markdown_files()
        print(f"Found {len(self.all_md_files)} markdown files")

        print("\nChecking links...")
        for md_file in self.all_md_files:
            for link in self.extract_markdown_links(md_file):
                exists, target = self.resolve_link(md_file, link)
                if target is None:
                    continue  # external link or anchor-only reference
                if exists:
                    # Track linked files for orphan detection
                    self.linked_files.add(target)
                else:
                    self.broken_links.append((md_file, link, str(target)))
        print(f"Found {len(self.broken_links)} broken links")

        print("\nChecking for orphaned documents...")
        # A file is orphaned if it's not linked from any other file.
        # Exclude README.md and CLAUDE.md as they are navigation files.
        # linked_files holds resolved paths, so compare resolved-to-resolved.
        for md_file in self.all_md_files:
            if md_file.name in ('README.md', 'CLAUDE.md'):
                continue
            if md_file.resolve() not in self.linked_files:
                self.orphaned_files.add(md_file)
        print(f"Found {len(self.orphaned_files)} orphaned files")

        print("\nChecking for missing navigation files...")
        # Every directory that contains at least one markdown file.
        directories_to_check = {md_file.parent for md_file in self.all_md_files}
        for directory in directories_to_check:
            # Skip archive directories
            if 'archive' in str(directory).lower():
                continue
            has_readme, has_claude = self.check_directory_navigation(directory)
            # Major directories (roots and their first/second-level
            # children) should have both README.md and CLAUDE.md.
            is_major_dir = (
                directory == DOCS_DIR or
                directory == INTERNAL_DIR or
                directory.parent == DOCS_DIR or
                directory.parent == INTERNAL_DIR or
                directory.parent.parent == INTERNAL_DIR
            )
            if not has_readme:
                self.missing_readmes.append(directory)
            if is_major_dir and not has_claude:
                self.missing_claude_mds.append(directory)
        print(f"Found {len(self.missing_readmes)} directories missing README.md")
        print(f"Found {len(self.missing_claude_mds)} major directories missing CLAUDE.md")

    def generate_report(self) -> str:
        """Generate a comprehensive markdown report of all findings."""
        # Local import so this block stands alone; only needed here.
        from datetime import datetime

        report_lines = [
            "# Documentation Analysis Report",
            "",
            # BUG FIX: original printed Path.cwd() as the "date".
            f"**Analysis Date:** {datetime.now().isoformat(timespec='seconds')}",
            f"**Total Documents:** {len(self.all_md_files)}",
            "",
            "## Summary",
            "",
            f"- **Broken Links:** {len(self.broken_links)}",
            f"- **Orphaned Documents:** {len(self.orphaned_files)}",
            f"- **Directories Missing README.md:** {len(self.missing_readmes)}",
            f"- **Major Directories Missing CLAUDE.md:** {len(self.missing_claude_mds)}",
            "",
        ]

        # Broken Links Section
        if self.broken_links:
            report_lines.extend([
                "## 1. Broken Links",
                "",
                "Files with links to non-existent documents:",
                "",
            ])
            # Group by source file
            by_file: Dict[Path, List[Tuple[str, str]]] = {}
            for source, link, target in self.broken_links:
                rel_source = source.relative_to(BASE_DIR)
                by_file.setdefault(rel_source, []).append((link, target))
            for source in sorted(by_file.keys()):
                report_lines.append(f"### {source}")
                report_lines.append("")
                for link, target in by_file[source]:
                    report_lines.append(f"- **Link:** `{link}`")
                    report_lines.append(f"  - **Resolves to:** `{target}`")
                    report_lines.append(f"  - **Status:** Does not exist")
                report_lines.append("")
        else:
            report_lines.extend([
                "## 1. Broken Links",
                "",
                "No broken links found! ✓",
                "",
            ])

        # Orphaned Documents Section
        if self.orphaned_files:
            report_lines.extend([
                "## 2. Orphaned Documents",
                "",
                "Documents not linked from any README or index file:",
                "",
            ])
            # Group by directory
            by_dir: Dict[Path, List[str]] = {}
            for orphan in self.orphaned_files:
                rel_path = orphan.relative_to(BASE_DIR)
                by_dir.setdefault(rel_path.parent, []).append(rel_path.name)
            for dir_path in sorted(by_dir.keys()):
                report_lines.append(f"### {dir_path}")
                report_lines.append("")
                for filename in sorted(by_dir[dir_path]):
                    # BUG FIX: original emitted the literal "- (unknown)"
                    # instead of the orphaned file's name.
                    report_lines.append(f"- {filename}")
                report_lines.append("")
        else:
            report_lines.extend([
                "## 2. Orphaned Documents",
                "",
                "No orphaned documents found! ✓",
                "",
            ])

        # Missing READMEs Section
        if self.missing_readmes:
            report_lines.extend([
                "## 3. Directories Missing README.md",
                "",
                "Directories containing markdown files but no README.md:",
                "",
            ])
            for directory in sorted(self.missing_readmes):
                rel_dir = directory.relative_to(BASE_DIR)
                file_count = len(list(directory.glob("*.md")))
                report_lines.append(f"- `{rel_dir}/` ({file_count} markdown files)")
            report_lines.append("")
        else:
            report_lines.extend([
                "## 3. Directories Missing README.md",
                "",
                "All directories have README.md files! ✓",
                "",
            ])

        # Missing CLAUDE.md Section
        if self.missing_claude_mds:
            report_lines.extend([
                "## 4. Major Directories Missing CLAUDE.md",
                "",
                "Major directories that should have CLAUDE.md for AI agent context:",
                "",
            ])
            for directory in sorted(self.missing_claude_mds):
                rel_dir = directory.relative_to(BASE_DIR)
                report_lines.append(f"- `{rel_dir}/`")
            report_lines.append("")
        else:
            report_lines.extend([
                "## 4. Major Directories Missing CLAUDE.md",
                "",
                "All major directories have CLAUDE.md files! ✓",
                "",
            ])

        # Recommendations Section
        report_lines.extend([
            "## Recommendations",
            "",
            "### High Priority Fixes",
            "",
        ])
        if self.broken_links:
            report_lines.append(f"1. **Fix {len(self.broken_links)} broken links** - Update or remove invalid references")
        if self.missing_readmes:
            report_lines.append(f"2. **Create {len(self.missing_readmes)} README.md files** - Add navigation to undocumented directories")
        if self.missing_claude_mds:
            report_lines.append(f"3. **Create {len(self.missing_claude_mds)} CLAUDE.md files** - Add AI agent context to major directories")
        if self.orphaned_files:
            report_lines.append(f"4. **Link or archive {len(self.orphaned_files)} orphaned documents** - Add to README or move to archive/")
        if not (self.broken_links or self.missing_readmes or self.missing_claude_mds or self.orphaned_files):
            report_lines.append("**No issues found!** Documentation structure is healthy. ✓")
        report_lines.append("")

        return "\n".join(report_lines)

    def save_json_report(self, output_path: Path) -> None:
        """Save a detailed JSON report for programmatic processing."""
        report_data = {
            "summary": {
                "total_documents": len(self.all_md_files),
                "broken_links": len(self.broken_links),
                "orphaned_files": len(self.orphaned_files),
                "missing_readmes": len(self.missing_readmes),
                "missing_claude_mds": len(self.missing_claude_mds),
            },
            "broken_links": [
                {
                    "source": str(source.relative_to(BASE_DIR)),
                    "link": link,
                    "target": target,
                }
                for source, link, target in self.broken_links
            ],
            "orphaned_files": [
                str(f.relative_to(BASE_DIR)) for f in sorted(self.orphaned_files)
            ],
            "missing_readmes": [
                str(d.relative_to(BASE_DIR)) for d in sorted(self.missing_readmes)
            ],
            "missing_claude_mds": [
                str(d.relative_to(BASE_DIR)) for d in sorted(self.missing_claude_mds)
            ],
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
def main() -> None:
    """Run the analyzer and write markdown + JSON reports under
    internal/project/reports/."""
    print("=" * 80)
    print("CODITECT Documentation Link Analyzer")
    print("=" * 80)
    print()

    analyzer = DocumentationAnalyzer()
    analyzer.analyze()

    # Generate reports
    markdown_report = analyzer.generate_report()

    # Save reports
    output_dir = BASE_DIR / "internal" / "project" / "reports"
    output_dir.mkdir(parents=True, exist_ok=True)
    md_report_path = output_dir / "DOCUMENTATION-LINK-ANALYSIS.md"
    json_report_path = output_dir / "documentation-link-analysis.json"

    md_report_path.write_text(markdown_report, encoding='utf-8')
    analyzer.save_json_report(json_report_path)

    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)
    print(f"\nMarkdown Report: {md_report_path}")
    print(f"JSON Report: {json_report_path}")
    print()
    # Print summary
    print(markdown_report)


# BUG FIX: original guard read `if name == "main"` (dunders stripped),
# which would raise NameError instead of running main().
if __name__ == "__main__":
    main()