#!/usr/bin/env python3
"""Analyze internal/ directory structure for documentation reorganization.

Generates a comprehensive report on:
- Directory structure and file counts
- Content categorization
- Agentic frontmatter presence
- Consolidation opportunities

Metadata (restored from the original frontmatter):
    script_name: analyze-internal-docs.py
    version: 1.0.0
    audience: contributor
    status: stable
    usage: python3 scripts/analyze-internal-docs.py [options]
    python_version: 3.10+
    dependencies: none (stdlib only)
"""
import json
import os
import re
from collections import defaultdict
from pathlib import Path

# Root of the coditect-core checkout; all analysis is scoped to its
# internal/ subdirectory.  NOTE(review): hard-coded absolute path --
# adjust for your checkout (or pass a directory to analyze_directory).
BASE_DIR = Path("/path/to/user/PROJECTS/coditect-rollout-master/submodules/core/coditect-core")
INTERNAL_DIR = BASE_DIR / "internal"
def has_yaml_frontmatter(file_path):
    """Check whether *file_path* opens with a YAML frontmatter block.

    Args:
        file_path: path (str or Path) of the file to inspect.

    Returns:
        tuple[bool, dict]: ``(True, data)`` when the file starts with a
        ``---`` line and a closing ``---`` line is found; *data* maps each
        ``key: value`` frontmatter line to its string value with surrounding
        quotes stripped.  Otherwise ``(False, {})``.

    Notes:
        Parsing is deliberately naive: lines are split on the first colon
        only, so nested YAML structures are not supported.  Any I/O or
        decode error is treated as "no frontmatter" (best-effort scan).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        if len(lines) < 2:
            return False, {}
        if lines[0].strip() == '---':
            # Accumulate lines until the closing --- delimiter.
            frontmatter_lines = []
            for line in lines[1:]:
                if line.strip() == '---':
                    # Closing delimiter found: parse the collected block.
                    fm_data = {}
                    for fm_line in frontmatter_lines:
                        if ':' in fm_line:
                            key, value = fm_line.split(':', 1)
                            fm_data[key.strip()] = value.strip().strip('"').strip("'")
                    return True, fm_data
                frontmatter_lines.append(line)
        # Opening marker missing, or no closing marker before EOF.
        return False, {}
    except Exception:
        # Best-effort: unreadable or undecodable files count as no frontmatter.
        return False, {}
def get_file_category(file_path):
    """Categorize a documentation file by topic keywords in its path.

    Args:
        file_path: path (str or Path) of the file to classify.

    Returns:
        str: one of ``'architecture'``, ``'deployment'``, ``'research'``,
        ``'project'``, ``'testing'``, or ``'other'``.

    Matching is a case-insensitive substring search over the whole path,
    and the first matching category wins (architecture > deployment >
    research > project > testing).
    """
    path_str = str(file_path).lower()
    # Ordered (category, keywords) rules; precedence matches the original
    # if-chain, so e.g. 'adr' beats 'deploy' when both appear in a path.
    rules = [
        ('architecture', ('adr', 'architecture', 'c4-diagram')),
        ('deployment', ('deploy', 'docker', 'ci-cd')),
        ('research', ('research',)),
        ('project', ('project', 'plan', 'tasklist', 'sprint')),
        ('testing', ('test', 'qa')),
    ]
    for category, keywords in rules:
        if any(k in path_str for k in keywords):
            return category
    return 'other'
def analyze_directory(internal_dir=None):
    """Walk a documentation tree and collect reorganization statistics.

    Args:
        internal_dir: directory to analyze.  Defaults to the module-level
            ``INTERNAL_DIR`` (resolved at call time, preserving the
            original zero-argument behavior).

    Returns:
        dict with keys:
            summary: total file / markdown / directory counts.  Note the
                walk root itself is not counted in ``total_directories``.
            by_directory: dir (relative, or 'root') -> {'count', 'files'}
                for directories containing markdown files.
            by_category: topic -> list of relative paths (defaultdict).
            frontmatter: YAML-frontmatter coverage counters and file lists.
            consolidation_opportunities: entries for directories holding
                more than 10 markdown files.
    """
    if internal_dir is None:
        internal_dir = INTERNAL_DIR
    results = {
        'summary': {
            'total_files': 0,
            'total_markdown': 0,
            'total_directories': 0
        },
        'by_directory': {},
        'by_category': defaultdict(list),
        'frontmatter': {
            'with_frontmatter': 0,
            'without_frontmatter': 0,
            'agentic_ready': 0,
            'files_with_frontmatter': [],
            'files_without_frontmatter': []
        },
        'consolidation_opportunities': []
    }

    # Single pass over the tree: tally counts and classify markdown files.
    for root, dirs, files in os.walk(internal_dir):
        rel_root = Path(root).relative_to(internal_dir)
        results['summary']['total_directories'] += len(dirs)
        md_files = [f for f in files if f.endswith('.md')]
        results['summary']['total_files'] += len(files)
        results['summary']['total_markdown'] += len(md_files)
        if md_files:
            dir_key = str(rel_root) if str(rel_root) != '.' else 'root'
            results['by_directory'][dir_key] = {
                'count': len(md_files),
                'files': md_files
            }
        for md_file in md_files:
            file_path = Path(root) / md_file
            rel_path = file_path.relative_to(internal_dir)
            # Frontmatter presence and "agentic readiness".
            has_fm, fm_data = has_yaml_frontmatter(file_path)
            if has_fm:
                results['frontmatter']['with_frontmatter'] += 1
                results['frontmatter']['files_with_frontmatter'].append(str(rel_path))
                # Agentic-ready = carries at least one agent-oriented key.
                agentic_keys = ['audience', 'tokens', 'summary', 'when_to_read', 'keywords']
                if any(k in fm_data for k in agentic_keys):
                    results['frontmatter']['agentic_ready'] += 1
            else:
                results['frontmatter']['without_frontmatter'] += 1
                results['frontmatter']['files_without_frontmatter'].append(str(rel_path))
            # Topic classification by path keywords.
            category = get_file_category(file_path)
            results['by_category'][category].append(str(rel_path))

    # Flag directories with more than 10 markdown files for consolidation.
    for dir_name, dir_data in results['by_directory'].items():
        if dir_data['count'] > 10:
            results['consolidation_opportunities'].append({
                'directory': dir_name,
                'file_count': dir_data['count'],
                'recommendation': f"Review {dir_name} ({dir_data['count']} files) for consolidation"
            })
    return results
def generate_report(results):
    """Render analysis *results* as a markdown report string.

    Args:
        results: structure produced by ``analyze_directory``.
            ``by_category`` is read with ``dict.get`` so both a plain dict
            (e.g. after a JSON round-trip) and a ``defaultdict`` work, and
            the input is never mutated by missing-key lookups.

    Returns:
        str: the complete markdown document (sections 1-5).
    """
    total_md = results['summary']['total_markdown']

    def pct(count):
        # Guard: an empty tree would otherwise raise ZeroDivisionError.
        return count / total_md * 100 if total_md else 0.0

    report = []
    report.append("# CODITECT Internal Documentation Analysis Report")
    report.append("")
    report.append(f"**Generated:** {Path(__file__).name}")
    report.append("**Target:** `internal/` directory")
    report.append("")
    report.append("---")
    report.append("")

    # Section 1: structure summary plus top-20 directories table.
    report.append("## 1. Directory Structure Analysis")
    report.append("")
    report.append(f"- **Total Files:** {results['summary']['total_files']}")
    report.append(f"- **Markdown Files:** {total_md}")
    report.append(f"- **Directories:** {results['summary']['total_directories']}")
    report.append("")
    report.append("### Top Directories by File Count")
    report.append("")
    sorted_dirs = sorted(results['by_directory'].items(), key=lambda x: x[1]['count'], reverse=True)
    report.append("| Directory | Files |")
    report.append("|-----------|-------|")
    for dir_name, dir_data in sorted_dirs[:20]:
        report.append(f"| `{dir_name}` | {dir_data['count']} |")
    report.append("")

    # Section 2: per-category collapsible file lists, capped at 50 entries.
    report.append("## 2. Content Categories")
    report.append("")
    for category, files in results['by_category'].items():
        report.append(f"### {category.capitalize()} ({len(files)} files)")
        report.append("")
        report.append("<details>")
        report.append(f"<summary>Show {len(files)} files</summary>")
        report.append("")
        for f in sorted(files)[:50]:  # Limit to 50
            report.append(f"- `{f}`")
        if len(files) > 50:
            report.append(f"- ... and {len(files) - 50} more")
        report.append("")
        report.append("</details>")
        report.append("")

    # Section 3: frontmatter coverage percentages.
    report.append("## 3. Agentic Frontmatter Assessment")
    report.append("")
    with_fm = results['frontmatter']['with_frontmatter']
    without_fm = results['frontmatter']['without_frontmatter']
    agentic = results['frontmatter']['agentic_ready']
    report.append(f"- **Files with YAML frontmatter:** {with_fm} ({pct(with_fm):.1f}%)")
    report.append(f"- **Files without frontmatter:** {without_fm} ({pct(without_fm):.1f}%)")
    report.append(f"- **Agentic-ready files:** {agentic} ({pct(agentic):.1f}%)")
    report.append("")

    # Section 4: directories flagged by analyze_directory (>10 files).
    report.append("## 4. Consolidation Opportunities")
    report.append("")
    report.append(f"**Target:** Reduce from {total_md} to ~50-80 well-organized files (80-85% reduction)")
    report.append("")
    if results['consolidation_opportunities']:
        report.append("### High-Priority Directories (>10 files)")
        report.append("")
        report.append("| Directory | Files | Recommendation |")
        report.append("|-----------|-------|----------------|")
        for opp in results['consolidation_opportunities']:
            report.append(f"| `{opp['directory']}` | {opp['file_count']} | {opp['recommendation']} |")
        report.append("")

    # Section 5: per-area recommendations (thresholds tuned for this repo).
    report.append("## 5. Specific Recommendations")
    report.append("")
    proj_files = [f for f in results['by_category'].get('project', []) if 'v2/' not in f]
    if len(proj_files) > 30:
        report.append("### Project Management")
        report.append("")
        report.append(f"- **Current:** {len(proj_files)} files (excluding v2/)")
        report.append("- **Target:** ~10 files")
        report.append("- **Action:** Consolidate legacy plans, sprints, and orchestration docs")
        report.append("")
    research_files = results['by_category'].get('research', [])
    if len(research_files) > 50:
        report.append("### Research Documentation")
        report.append("")
        report.append(f"- **Current:** {len(research_files)} files")
        report.append("- **Target:** ~15-20 files")
        report.append("- **Action:** Archive historical research, consolidate by topic")
        report.append("")
    arch_files = results['by_category'].get('architecture', [])
    report.append("### Architecture Documentation")
    report.append("")
    report.append(f"- **Current:** {len(arch_files)} files")
    report.append("- **Status:** ADRs should NOT be consolidated (permanent record)")
    report.append("- **Action:** Create index files, archive superseded diagrams")
    report.append("")
    return "\n".join(report)
def main():
    """Analyze internal/, write JSON + markdown reports, print a summary.

    Side effects: writes ``internal-docs-analysis.json`` and
    ``internal-docs-analysis-report.md`` under ``BASE_DIR``.
    """
    print("Analyzing internal/ directory...")
    results = analyze_directory()

    # Save raw results as JSON (default=str stringifies Path and any other
    # non-JSON-serializable values).
    json_path = BASE_DIR / "internal-docs-analysis.json"
    with open(json_path, 'w') as f:
        json.dump(results, f, indent=2, default=str)
    print(f"✓ Saved JSON analysis: {json_path}")

    # Generate and save the markdown report.
    report = generate_report(results)
    report_path = BASE_DIR / "internal-docs-analysis-report.md"
    with open(report_path, 'w') as f:
        f.write(report)
    print(f"✓ Saved report: {report_path}")

    # Console summary.
    total_md = results['summary']['total_markdown']
    with_fm = results['frontmatter']['with_frontmatter']
    agentic = results['frontmatter']['agentic_ready']
    denom = total_md or 1  # avoid ZeroDivisionError on an empty tree
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total markdown files: {total_md}")
    print(f"With frontmatter: {with_fm} ({with_fm / denom * 100:.1f}%)")
    print(f"Agentic-ready: {agentic} ({agentic / denom * 100:.1f}%)")
    print(f"\nConsolidation opportunities: {len(results['consolidation_opportunities'])}")
    print(f"Target reduction: {total_md} → ~60 files (85% reduction)")
# Script entry point.  The original extraction stripped the dunder
# underscores ("if name == 'main'"), which would raise NameError.
if __name__ == '__main__':
    main()