scripts-audit-markdown
#!/usr/bin/env python3
"""
title: Markdown Audit Script
component_type: script
version: 1.0.0
author: CODITECT Framework Team
summary: Audits markdown files for mermaid issues, GitHub compatibility, and formatting problems
tags: ['markdown', 'audit', 'mermaid', 'documentation']
moe_confidence: 0.930
moe_classified: '2026-01-29'
moe_type_expert: documentation-expert
related_components:
- agent: markdown-mermaid-cleaner
- command: markdown-cleanup
- script: fix-markdown-issues
- skill: markdown-mermaid-best-practices
Markdown Audit and Fix Script
Audits markdown files for:
- Broken mermaid diagrams
- GitHub-incompatible HTML tags
- Unclosed code blocks
- Trailing whitespace
- ASCII diagrams that should be mermaid
Usage:
    python3 scripts/audit-markdown.py [--fix] [--report] [--output FILE]
"""
import argparse
import json
import re
import sys
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
@dataclass
class MarkdownIssue:
    """One problem detected while auditing a markdown file."""

    # Path of the file the issue belongs to.
    file: str
    # Line number the issue is reported at.
    line: int
    # Short machine-readable category of the issue.
    type: str
    # Human-readable description of the issue.
    message: str
    # One of: error, warning, info.
    severity: str = field(default="warning")
class MarkdownAuditor:
    """Audit the markdown files under a repository root for common issues.

    Each ``check_*`` method inspects the text of one file and returns the
    problems it found as ``MarkdownIssue`` records.  ``run_audit`` walks the
    repository, runs every check on every file and aggregates the results
    into a plain dictionary that ``print_report`` can display.
    """

    # Directory names whose contents are treated as third-party or generated.
    _EXCLUDED_NAMES = frozenset({
        '.venv', '.git', 'context-storage',
        'node_modules', '.pytest_cache', '.ruff_cache',
        '.mypy_cache', 'htmlcov', 'test-results',
        '__pycache__', '.DS_Store',
    })

    # Lines matching any of these look like part of a hand-drawn diagram/table.
    _ASCII_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'^[┌├└│].*[┐┤┘│]$',
        r'^.*[┌┐└┘├┤│─].*[┌┐└┘├┤│─].*$',
        r'^\+[-]+\+$',
        r'^\|.*\|$',
    ))

    # Minimum number of consecutive diagram-like lines worth reporting.
    _MIN_ASCII_DIAGRAM_LINES = 5

    def __init__(self, repo_root: Path):
        """Create an auditor for the tree rooted at ``repo_root``."""
        self.repo_root = repo_root
        self.issues: "List[MarkdownIssue]" = []
        self.files_checked = 0
        self.files_with_issues = 0

    @staticmethod
    def _fence_info(line: str) -> Optional[str]:
        """Return the info string of a backtick fence line, or None.

        A fence is a line whose stripped text starts with three or more
        backticks.  The returned string is empty for a bare fence.
        """
        stripped = line.strip()
        if not stripped.startswith('```'):
            return None
        return stripped.lstrip('`').strip()

    def find_markdown_files(self) -> List[Path]:
        """Find all markdown files under the root, excluding third-party.

        Exclusions are matched against whole path components *relative to
        the repository root*.  Substring matching on the absolute path would
        also drop unrelated paths such as ``.github/`` (contains ``.git``) or
        every file when the checkout directory itself contains an excluded
        name.

        Returns:
            The matching paths in sorted order, so reports are deterministic.
        """
        return sorted(
            md_file
            for md_file in self.repo_root.rglob('*.md')
            if self._EXCLUDED_NAMES.isdisjoint(
                md_file.relative_to(self.repo_root).parts
            )
        )

    def _check_mermaid_body(self, mermaid_text: str, start_line: int,
                            file_path: str) -> "List[MarkdownIssue]":
        """Run the content checks on the text of one closed mermaid block."""
        issues = []

        # '<br>' is never a substring of '<br/>', so a plain membership test
        # is enough; a diagram mixing both forms is still reported.
        if '<br>' in mermaid_text:
            issues.append(MarkdownIssue(
                file=file_path,
                line=start_line,
                type='mermaid_html',
                message='Mermaid diagram uses <br> instead of <br/> or \\n',
                severity='warning'
            ))

        # An odd number of double quotes means one of them is unbalanced.
        if mermaid_text.count('"') % 2 != 0:
            issues.append(MarkdownIssue(
                file=file_path,
                line=start_line,
                type='mermaid_quotes',
                message='Mermaid diagram has unclosed quotes',
                severity='error'
            ))

        # Look for the entity itself (or the character it decodes to), not
        # for an ordinary space, which virtually every diagram contains.
        if '&nbsp;' in mermaid_text or '\u00a0' in mermaid_text:
            issues.append(MarkdownIssue(
                file=file_path,
                line=start_line,
                type='mermaid_html_entity',
                message='Mermaid diagram contains &nbsp; entity',
                severity='warning'
            ))

        return issues

    def check_mermaid_diagrams(self, content: str, file_path: str) -> "List[MarkdownIssue]":
        """Check mermaid diagrams for common issues.

        Args:
            content: Full text of the markdown file.
            file_path: Path recorded on every issue that is produced.

        Returns:
            Issues for ``<br>`` tags, unbalanced double quotes and ``&nbsp;``
            entities inside each mermaid block, plus an error for a block
            that is never closed.  All issues of a block are reported at the
            line of its opening fence.
        """
        issues = []
        in_mermaid = False
        start_line = 0
        body: List[str] = []

        for line_no, line in enumerate(content.split('\n'), 1):
            stripped = line.strip()
            if stripped.startswith('```mermaid'):
                in_mermaid = True
                start_line = line_no
                body = []
            elif in_mermaid and stripped == '```':
                in_mermaid = False
                issues.extend(
                    self._check_mermaid_body('\n'.join(body), start_line, file_path)
                )
            elif in_mermaid:
                body.append(line)

        if in_mermaid:
            issues.append(MarkdownIssue(
                file=file_path,
                line=start_line,
                type='mermaid_unclosed',
                message='Mermaid diagram not closed',
                severity='error'
            ))

        return issues

    def check_code_blocks(self, content: str, file_path: str) -> "List[MarkdownIssue]":
        """Check code blocks are properly balanced.

        A fence carrying an info string always opens a block.  A bare fence
        closes the innermost open block, or opens an unlabelled block when
        nothing is open; an unlabelled block is valid markdown and must not
        be reported as a stray closing fence.

        Returns:
            One ``code_unclosed`` error per block still open at end of file,
            reported at the line of its opening fence.
        """
        open_blocks = []  # (info string, line number of the opening fence)

        for line_no, line in enumerate(content.split('\n'), 1):
            info = self._fence_info(line)
            if info is None:
                continue
            if info == '' and open_blocks:
                open_blocks.pop()
            else:
                open_blocks.append((info, line_no))

        return [
            MarkdownIssue(
                file=file_path,
                line=opened_at,
                type='code_unclosed',
                message=f'Unclosed code block: {lang}',
                severity='error'
            )
            for lang, opened_at in open_blocks
        ]

    def check_html_compatibility(self, content: str, file_path: str) -> "List[MarkdownIssue]":
        """Check for GitHub-incompatible HTML outside fenced code blocks.

        Lines inside fenced code blocks (and the fence lines themselves) are
        skipped: HTML shown there is sample text, not markup that will be
        rendered.

        Returns:
            One ``html_compat`` warning per problematic tag per line.
        """
        issues = []

        # HTML tags that GitHub doesn't support well
        problematic_tags = ('<br>', '</br>', '<div', '<span', '<font', '<center>')
        fence_depth = 0

        for line_no, line in enumerate(content.split('\n'), 1):
            info = self._fence_info(line)
            if info is not None:
                # Same open/close rule as check_code_blocks.
                if info == '' and fence_depth:
                    fence_depth -= 1
                else:
                    fence_depth += 1
                continue
            if fence_depth:
                continue

            lowered = line.lower()
            for tag in problematic_tags:
                if tag in lowered:
                    issues.append(MarkdownIssue(
                        file=file_path,
                        line=line_no,
                        type='html_compat',
                        message=f'Potentially incompatible HTML: {tag}',
                        severity='warning'
                    ))

        return issues

    def check_ascii_diagrams(self, content: str, file_path: str) -> "List[MarkdownIssue]":
        """Check for ASCII diagrams that could be mermaid.

        Consecutive lines that look like box drawing or ASCII tables are
        grouped into a run; runs of at least five lines are reported at the
        line where they start.

        Returns:
            One ``ascii_diagram`` info issue per substantial run.
        """
        issues = []
        run_start = 0
        run_length = 0

        # The empty sentinel matches no pattern, so a run that reaches the
        # last line of the file is flushed like any other.
        for line_no, line in enumerate(content.split('\n') + [''], 1):
            if any(pattern.match(line) for pattern in self._ASCII_PATTERNS):
                if run_length == 0:
                    run_start = line_no
                run_length += 1
                continue

            if run_length >= self._MIN_ASCII_DIAGRAM_LINES:
                issues.append(MarkdownIssue(
                    file=file_path,
                    line=run_start,
                    type='ascii_diagram',
                    message=f'ASCII diagram ({run_length} lines) could be converted to mermaid',
                    severity='info'
                ))
            run_length = 0

        return issues

    def check_trailing_whitespace(self, content: str, file_path: str) -> "List[MarkdownIssue]":
        """Check for trailing spaces or tabs on non-blank lines.

        Returns:
            One ``trailing_whitespace`` info issue per offending line.
        """
        return [
            MarkdownIssue(
                file=file_path,
                line=line_no,
                type='trailing_whitespace',
                message='Line has trailing whitespace',
                severity='info'
            )
            for line_no, line in enumerate(content.split('\n'), 1)
            if line.endswith((' ', '\t')) and line.strip()
        ]

    def audit_file(self, file_path: Path) -> "List[MarkdownIssue]":
        """Audit a single markdown file.

        Args:
            file_path: File to read as UTF-8 and run every check on.

        Returns:
            All issues found.  If the file cannot be read or decoded, a
            single ``read_error`` issue at line 0 is returned instead.
        """
        # Report paths relative to the root when possible, for read errors
        # as well as for ordinary issues.
        try:
            file_str = str(file_path.relative_to(self.repo_root))
        except ValueError:
            file_str = str(file_path)

        try:
            content = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            return [MarkdownIssue(
                file=file_str,
                line=0,
                type='read_error',
                message=f'Could not read file: {e}',
                severity='error'
            )]

        checks = (
            self.check_mermaid_diagrams,
            self.check_code_blocks,
            self.check_html_compatibility,
            self.check_ascii_diagrams,
            self.check_trailing_whitespace,
        )
        issues = []
        for check in checks:
            issues.extend(check(content, file_str))
        return issues

    def run_audit(self, fix: bool = False) -> Dict:
        """Run full audit on all markdown files.

        Args:
            fix: Accepted for interface compatibility; this method does not
                modify any file.

        Returns:
            A dictionary with the keys ``files_checked``,
            ``files_with_issues``, ``total_issues``, ``issues_by_type``,
            ``issues_by_severity`` and ``issues`` (one plain dict per issue).
        """
        md_files = self.find_markdown_files()
        print(f"Auditing {len(md_files)} markdown files...")

        # Start from zero so a second run on the same auditor does not add
        # to the previous run's totals.
        self.files_checked = 0
        self.files_with_issues = 0

        all_issues = []
        for md_file in md_files:
            self.files_checked += 1
            found = self.audit_file(md_file)
            if found:
                self.files_with_issues += 1
                all_issues.extend(found)

        self.issues = all_issues

        by_severity = {'error': 0, 'warning': 0, 'info': 0}
        for issue in all_issues:
            # .get() keeps an unexpected severity from raising KeyError.
            by_severity[issue.severity] = by_severity.get(issue.severity, 0) + 1

        return {
            'files_checked': self.files_checked,
            'files_with_issues': self.files_with_issues,
            'total_issues': len(all_issues),
            'issues_by_type': dict(Counter(issue.type for issue in all_issues)),
            'issues_by_severity': by_severity,
            'issues': [
                {
                    'file': issue.file,
                    'line': issue.line,
                    'type': issue.type,
                    'message': issue.message,
                    'severity': issue.severity
                }
                for issue in all_issues
            ]
        }

    def print_report(self, report: Dict):
        """Print a summary of a report produced by ``run_audit``.

        Shows the totals, the counts per severity, the ten most frequent
        issue types and the first twenty errors.
        """
        icons = {'error': "🔴", 'warning': "🟡"}
        max_errors_shown = 20

        print("\n" + "="*70)
        print("MARKDOWN AUDIT REPORT")
        print("="*70)
        print(f"\nFiles checked: {report['files_checked']}")
        print(f"Files with issues: {report['files_with_issues']}")
        print(f"Total issues: {report['total_issues']}")

        print("\nIssues by severity:")
        for severity, count in report['issues_by_severity'].items():
            icon = icons.get(severity, "🔵")
            print(f" {icon} {severity.capitalize()}: {count}")

        if report['issues_by_type']:
            print("\nIssues by type:")
            most_frequent = sorted(report['issues_by_type'].items(),
                                   key=lambda x: x[1], reverse=True)[:10]
            for issue_type, count in most_frequent:
                print(f" - {issue_type}: {count}")

        # Show errors
        errors = [i for i in report['issues'] if i['severity'] == 'error']
        if errors:
            print("\n🔴 ERRORS (require fixing):")
            for issue in errors[:max_errors_shown]:
                print(f" {issue['file']}:{issue['line']} - {issue['message']}")
            if len(errors) > max_errors_shown:
                print(f" ... and {len(errors) - max_errors_shown} more errors")

        print("\n" + "="*70)
def main():
    """Command-line entry point: audit the markdown files under the cwd.

    Parses ``--fix``, ``--report`` and ``--output``, runs the audit from the
    current working directory, prints the report and, when ``--output`` is
    given, also writes the report there as JSON.

    Exits with status 1 if the audit found at least one error-severity
    issue, otherwise with status 0.
    """
    parser = argparse.ArgumentParser(description='Audit markdown files')
    parser.add_argument('--fix', action='store_true', help='Fix issues automatically')
    parser.add_argument('--report', action='store_true', help='Generate detailed report')
    parser.add_argument('--output', type=str, help='Output report to file')

    args = parser.parse_args()

    if args.fix:
        # The auditor only reports; say so instead of silently ignoring the flag.
        print("Note: --fix is not implemented; no files will be modified.",
              file=sys.stderr)

    repo_root = Path.cwd()
    auditor = MarkdownAuditor(repo_root)

    report = auditor.run_audit(fix=args.fix)
    auditor.print_report(report)

    if args.output:
        # Explicit encoding so the file does not depend on the locale default.
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f"\nReport saved to: {args.output}")

    # Exit with error code if there are errors
    errors = report['issues_by_severity'].get('error', 0)
    sys.exit(1 if errors > 0 else 0)
# Run the audit only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()