#!/usr/bin/env python3
"""
Documentation Link Analyzer

Analyzes all markdown files for broken links, orphaned documents, and
missing navigation files (README.md / CLAUDE.md).
"""
import os
import re
from pathlib import Path
from typing import Set, Dict, List, Tuple
import json

# Base directory of the repository being analyzed; docs/ and internal/
# are the two trees that are scanned.
BASE_DIR = Path("/path/to/user/PROJECTS/coditect-rollout-master/submodules/core/coditect-core")
DOCS_DIR = BASE_DIR / "docs"
INTERNAL_DIR = BASE_DIR / "internal"
class DocumentationAnalyzer:
    """Analyze the docs/ and internal/ markdown trees for broken links,
    orphaned documents, and directories missing navigation files
    (README.md / CLAUDE.md)."""

    def __init__(self) -> None:
        # BUG FIX: original defined `def init`, so this never ran as the
        # constructor and no attributes were initialized.
        # All markdown files discovered under DOCS_DIR and INTERNAL_DIR.
        self.all_md_files: Set[Path] = set()
        # Resolved paths that are the target of at least one valid link.
        self.linked_files: Set[Path] = set()
        # (source file, raw link text, resolved-but-missing path) triples.
        self.broken_links: List[Tuple[Path, str, str]] = []
        # Files not linked from anywhere (README/CLAUDE excluded).
        self.orphaned_files: Set[Path] = set()
        # Directories with markdown files but no README.md.
        self.missing_readmes: List[Path] = []
        # Major directories with no CLAUDE.md.
        self.missing_claude_mds: List[Path] = []

    def find_all_markdown_files(self) -> Set[Path]:
        """Find all markdown files in docs/ and internal/."""
        md_files = set()
        for directory in [DOCS_DIR, INTERNAL_DIR]:
            if directory.exists():
                for md_file in directory.rglob("*.md"):
                    md_files.add(md_file)
        return md_files

    def extract_markdown_links(self, file_path: Path) -> List[str]:
        """Extract all markdown link targets from a file.

        Returns an empty list (and prints a warning) if the file
        cannot be read or decoded.
        """
        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {file_path}: {e}")
            return []
        # Match both [text](link) and [text]: link formats
        inline_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
        reference_pattern = r'\[([^\]]+)\]:\s*(.+)'
        links = []
        for match in re.finditer(inline_pattern, content):
            links.append(match.group(2))
        for match in re.finditer(reference_pattern, content):
            links.append(match.group(2))
        return links

    def resolve_link(self, source_file: Path, link: str) -> Tuple[bool, Path]:
        """
        Resolve a link relative to the source file.

        Returns (exists, resolved_path). resolved_path is None for
        external links and pure-anchor references, otherwise the
        fully resolved target path (whether or not it exists).
        """
        # Skip external links
        if link.startswith(('http://', 'https://', 'mailto:', '#')):
            return (True, None)  # External links assumed valid
        # Remove anchors
        link = link.split('#')[0]
        if not link:
            return (True, None)  # Just an anchor reference
        # Resolve relative to the source file's directory; leading '/'
        # means "from repo root".
        source_dir = source_file.parent
        if link.startswith('/'):
            target_path = (BASE_DIR / link[1:]).resolve()
        else:
            target_path = (source_dir / link).resolve()
        # BUG FIX: the original returned None for the path when the target
        # existed, which left linked_files empty in analyze() and made
        # every document look orphaned. Always return the resolved path.
        return (target_path.exists(), target_path)

    def check_directory_navigation(self, directory: Path) -> Tuple[bool, bool]:
        """
        Check if directory has README.md and CLAUDE.md.

        Returns (has_readme, has_claude_md).
        """
        has_readme = (directory / "README.md").exists()
        has_claude = (directory / "CLAUDE.md").exists()
        return (has_readme, has_claude)

    def analyze(self) -> None:
        """Run the complete analysis, populating all result attributes."""
        print("Finding all markdown files...")
        self.all_md_files = self.find_all_markdown_files()
        print(f"Found {len(self.all_md_files)} markdown files")

        print("\nChecking links...")
        for md_file in self.all_md_files:
            for link in self.extract_markdown_links(md_file):
                exists, target = self.resolve_link(md_file, link)
                if target is None:
                    continue  # external link or anchor-only reference
                if exists:
                    # Track linked files for orphan detection
                    self.linked_files.add(target)
                else:
                    self.broken_links.append((md_file, link, str(target)))
        print(f"Found {len(self.broken_links)} broken links")

        print("\nChecking for orphaned documents...")
        # A file is orphaned if it's not linked from any other file.
        # Exclude README.md and CLAUDE.md as they are navigation files.
        # linked_files holds resolved paths, so compare resolved-to-resolved.
        for md_file in self.all_md_files:
            if md_file.name in ('README.md', 'CLAUDE.md'):
                continue
            if md_file.resolve() not in self.linked_files:
                self.orphaned_files.add(md_file)
        print(f"Found {len(self.orphaned_files)} orphaned files")

        print("\nChecking for missing navigation files...")
        # Every directory that contains at least one markdown file.
        directories_to_check = {md_file.parent for md_file in self.all_md_files}
        for directory in directories_to_check:
            # Skip archive directories
            if 'archive' in str(directory).lower():
                continue
            has_readme, has_claude = self.check_directory_navigation(directory)
            # Major directories (roots and their first/second-level
            # children) should have both README.md and CLAUDE.md.
            is_major_dir = (
                directory == DOCS_DIR or
                directory == INTERNAL_DIR or
                directory.parent == DOCS_DIR or
                directory.parent == INTERNAL_DIR or
                directory.parent.parent == INTERNAL_DIR
            )
            if not has_readme:
                self.missing_readmes.append(directory)
            if is_major_dir and not has_claude:
                self.missing_claude_mds.append(directory)
        print(f"Found {len(self.missing_readmes)} directories missing README.md")
        print(f"Found {len(self.missing_claude_mds)} major directories missing CLAUDE.md")

    def generate_report(self) -> str:
        """Generate a comprehensive markdown report of all findings."""
        # Local import so this block stands alone; only needed here.
        from datetime import datetime

        report_lines = [
            "# Documentation Analysis Report",
            "",
            # BUG FIX: original printed Path.cwd() as the "date".
            f"**Analysis Date:** {datetime.now().isoformat(timespec='seconds')}",
            f"**Total Documents:** {len(self.all_md_files)}",
            "",
            "## Summary",
            "",
            f"- **Broken Links:** {len(self.broken_links)}",
            f"- **Orphaned Documents:** {len(self.orphaned_files)}",
            f"- **Directories Missing README.md:** {len(self.missing_readmes)}",
            f"- **Major Directories Missing CLAUDE.md:** {len(self.missing_claude_mds)}",
            "",
        ]

        # Broken Links Section
        if self.broken_links:
            report_lines.extend([
                "## 1. Broken Links",
                "",
                "Files with links to non-existent documents:",
                "",
            ])
            # Group by source file
            by_file: Dict[Path, List[Tuple[str, str]]] = {}
            for source, link, target in self.broken_links:
                rel_source = source.relative_to(BASE_DIR)
                by_file.setdefault(rel_source, []).append((link, target))
            for source in sorted(by_file.keys()):
                report_lines.append(f"### {source}")
                report_lines.append("")
                for link, target in by_file[source]:
                    report_lines.append(f"- **Link:** `{link}`")
                    report_lines.append(f"  - **Resolves to:** `{target}`")
                    report_lines.append(f"  - **Status:** Does not exist")
                report_lines.append("")
        else:
            report_lines.extend([
                "## 1. Broken Links",
                "",
                "No broken links found! ✓",
                "",
            ])

        # Orphaned Documents Section
        if self.orphaned_files:
            report_lines.extend([
                "## 2. Orphaned Documents",
                "",
                "Documents not linked from any README or index file:",
                "",
            ])
            # Group by directory
            by_dir: Dict[Path, List[str]] = {}
            for orphan in self.orphaned_files:
                rel_path = orphan.relative_to(BASE_DIR)
                by_dir.setdefault(rel_path.parent, []).append(rel_path.name)
            for dir_path in sorted(by_dir.keys()):
                report_lines.append(f"### {dir_path}")
                report_lines.append("")
                for filename in sorted(by_dir[dir_path]):
                    # BUG FIX: original emitted the literal "- (unknown)"
                    # instead of the orphaned file's name.
                    report_lines.append(f"- {filename}")
                report_lines.append("")
        else:
            report_lines.extend([
                "## 2. Orphaned Documents",
                "",
                "No orphaned documents found! ✓",
                "",
            ])

        # Missing READMEs Section
        if self.missing_readmes:
            report_lines.extend([
                "## 3. Directories Missing README.md",
                "",
                "Directories containing markdown files but no README.md:",
                "",
            ])
            for directory in sorted(self.missing_readmes):
                rel_dir = directory.relative_to(BASE_DIR)
                file_count = len(list(directory.glob("*.md")))
                report_lines.append(f"- `{rel_dir}/` ({file_count} markdown files)")
            report_lines.append("")
        else:
            report_lines.extend([
                "## 3. Directories Missing README.md",
                "",
                "All directories have README.md files! ✓",
                "",
            ])

        # Missing CLAUDE.md Section
        if self.missing_claude_mds:
            report_lines.extend([
                "## 4. Major Directories Missing CLAUDE.md",
                "",
                "Major directories that should have CLAUDE.md for AI agent context:",
                "",
            ])
            for directory in sorted(self.missing_claude_mds):
                rel_dir = directory.relative_to(BASE_DIR)
                report_lines.append(f"- `{rel_dir}/`")
            report_lines.append("")
        else:
            report_lines.extend([
                "## 4. Major Directories Missing CLAUDE.md",
                "",
                "All major directories have CLAUDE.md files! ✓",
                "",
            ])

        # Recommendations Section
        report_lines.extend([
            "## Recommendations",
            "",
            "### High Priority Fixes",
            "",
        ])
        if self.broken_links:
            report_lines.append(f"1. **Fix {len(self.broken_links)} broken links** - Update or remove invalid references")
        if self.missing_readmes:
            report_lines.append(f"2. **Create {len(self.missing_readmes)} README.md files** - Add navigation to undocumented directories")
        if self.missing_claude_mds:
            report_lines.append(f"3. **Create {len(self.missing_claude_mds)} CLAUDE.md files** - Add AI agent context to major directories")
        if self.orphaned_files:
            report_lines.append(f"4. **Link or archive {len(self.orphaned_files)} orphaned documents** - Add to README or move to archive/")
        if not (self.broken_links or self.missing_readmes or self.missing_claude_mds or self.orphaned_files):
            report_lines.append("**No issues found!** Documentation structure is healthy. ✓")
        report_lines.append("")

        return "\n".join(report_lines)

    def save_json_report(self, output_path: Path) -> None:
        """Save a detailed JSON report for programmatic processing."""
        report_data = {
            "summary": {
                "total_documents": len(self.all_md_files),
                "broken_links": len(self.broken_links),
                "orphaned_files": len(self.orphaned_files),
                "missing_readmes": len(self.missing_readmes),
                "missing_claude_mds": len(self.missing_claude_mds),
            },
            "broken_links": [
                {
                    "source": str(source.relative_to(BASE_DIR)),
                    "link": link,
                    "target": target,
                }
                for source, link, target in self.broken_links
            ],
            "orphaned_files": [
                str(f.relative_to(BASE_DIR)) for f in sorted(self.orphaned_files)
            ],
            "missing_readmes": [
                str(d.relative_to(BASE_DIR)) for d in sorted(self.missing_readmes)
            ],
            "missing_claude_mds": [
                str(d.relative_to(BASE_DIR)) for d in sorted(self.missing_claude_mds)
            ],
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
def main() -> None:
    """Run the analyzer and write markdown + JSON reports under
    internal/project/reports/."""
    print("=" * 80)
    print("CODITECT Documentation Link Analyzer")
    print("=" * 80)
    print()

    analyzer = DocumentationAnalyzer()
    analyzer.analyze()

    # Generate reports
    markdown_report = analyzer.generate_report()

    # Save reports
    output_dir = BASE_DIR / "internal" / "project" / "reports"
    output_dir.mkdir(parents=True, exist_ok=True)
    md_report_path = output_dir / "DOCUMENTATION-LINK-ANALYSIS.md"
    json_report_path = output_dir / "documentation-link-analysis.json"

    md_report_path.write_text(markdown_report, encoding='utf-8')
    analyzer.save_json_report(json_report_path)

    print("\n" + "=" * 80)
    print("ANALYSIS COMPLETE")
    print("=" * 80)
    print(f"\nMarkdown Report: {md_report_path}")
    print(f"JSON Report: {json_report_path}")
    print()
    # Print summary
    print(markdown_report)


# BUG FIX: original guard read `if name == "main"` (dunders stripped),
# which would raise NameError instead of running main().
if __name__ == "__main__":
    main()