#!/usr/bin/env python3
"""
CODITECT Skill Resource Extractor

Extracts large sections from SKILL.md into resource files to fix:

  - B3_level3_resources: Skills >300 lines need extra .md files
  - B1_under_500_lines: Skills body must be <=500 lines

Strategy: Find the largest H2 section(s) and extract to separate .md files,
leaving a brief reference in SKILL.md.

Usage: python3 scripts/qa/fix-skill-resources.py [--dry-run] [--verbose] [--path skills/]

ADR-161: Component Quality Assurance Framework
Track: E.14 (QA Remediation)
"""

import argparse
import os
import re
import sys
from pathlib import Path

# Repo root resolved from this file's location (scripts/qa/ -> two levels up).
# NOTE: original had Path(file), a NameError -- must be Path(__file__).
CODITECT_CORE = Path(__file__).resolve().parents[2]
SKILLS_DIR = CODITECT_CORE / "skills"

# YAML frontmatter at the start of a file: group(1) opening '---' line,
# group(2) the frontmatter body (non-greedy; DOTALL lets '.' span newlines),
# group(3) closing '---' line.
FRONTMATTER_RE = re.compile(r'^(---\s*\n)(.*?)(\n---\s*\n)', re.DOTALL)

# Sections that should NOT be extracted (core structure).
KEEP_SECTIONS = {
    'table of contents', 'toc', 'contents', 'purpose', 'overview',
    'when to use', 'steps', 'quick start', 'integration', 'related',
    'principles', 'anti-patterns', 'best practices',
}

# Resource file name mappings based on section content.
# Keys are substrings matched against a lowercased section title.
RESOURCE_NAMES = {
    'example': 'examples.md',
    'examples': 'examples.md',
    'detailed example': 'examples.md',
    'detailed examples': 'examples.md',
    'code example': 'examples.md',
    'implementation': 'implementation.md',
    'implementation details': 'implementation.md',
    'implementation guide': 'implementation.md',
    'reference': 'reference.md',
    'api reference': 'reference.md',
    'configuration': 'configuration.md',
    'configuration options': 'configuration.md',
    'advanced configuration': 'configuration.md',
    'workflow': 'workflow.md',
    'workflows': 'workflow.md',
    'advanced': 'advanced.md',
    'advanced usage': 'advanced.md',
    'advanced topics': 'advanced.md',
    'patterns': 'patterns.md',
    'design patterns': 'patterns.md',
    'troubleshooting': 'troubleshooting.md',
    'templates': 'templates.md',
    'template': 'templates.md',
}

def slugify(text):
    """Convert a section title to a safe .md filename.

    Lowercases and trims the title, drops any character that is not a
    lowercase letter, digit, space, or hyphen, collapses runs of
    whitespace into single hyphens, strips edge hyphens, and appends
    the '.md' extension.
    """
    slug = text.lower().strip()
    slug = re.sub(r'[^a-z0-9\s-]', '', slug)
    slug = re.sub(r'\s+', '-', slug)
    slug = slug.strip('-')
    return slug + '.md'

def get_resource_name(section_title, existing_names):
    """Determine a resource file name for an extracted section.

    Tries the known RESOURCE_NAMES substring mappings first, then a slug
    generated from the title; in both cases appends a numeric suffix
    (-2 .. -9) when the preferred name is already taken. Falls back to
    a generic 'RESOURCE-N.md' name as a last resort.

    Args:
        section_title: The H2 section title being extracted.
        existing_names: Set of file names already used in the skill dir.

    Returns:
        A file name (str) not present in existing_names, or the generic
        fallback if every candidate collides.
    """
    title_lower = section_title.lower().strip()

    # Check known mappings
    for pattern, name in RESOURCE_NAMES.items():
        if pattern in title_lower:
            if name not in existing_names:
                return name
            # Add suffix if name taken
            base = name.replace('.md', '')
            for i in range(2, 10):
                candidate = f"{base}-{i}.md"
                if candidate not in existing_names:
                    return candidate

    # Generate from title
    name = slugify(section_title)
    if name not in existing_names:
        return name

    base = name.replace('.md', '')
    for i in range(2, 10):
        candidate = f"{base}-{i}.md"
        if candidate not in existing_names:
            return candidate

    # All candidates taken -- emit a generic, guaranteed-fresh name.
    return f"RESOURCE-{len(existing_names) + 1}.md"

def parse_h2_sections(body):
    """Parse a markdown body into H2 sections with positions and sizes.

    Each section runs from its '## ' heading to the start of the next H2
    heading (or end of body). Returns a list of dicts with keys:
    'title', 'title_lower', 'start', 'end', 'content', 'lines'.
    """
    sections = []
    pattern = re.compile(r'^(##\s+.+)$', re.MULTILINE)
    matches = list(pattern.finditer(body))

    for i, match in enumerate(matches):
        title = match.group(1).replace('## ', '').strip()
        start = match.start()
        # Section extends to the next H2 heading, or the end of the body.
        end = matches[i + 1].start() if i + 1 < len(matches) else len(body)
        content = body[start:end]
        line_count = content.count('\n')
        sections.append({
            'title': title,
            'title_lower': title.lower().strip(),
            'start': start,
            'end': end,
            'content': content,
            'lines': line_count,
        })

    return sections

def should_keep(section):
    """Check if a section must stay in SKILL.md (core structure).

    A section is kept when any KEEP_SECTIONS entry appears as a
    substring of its lowercased title.
    """
    title_lower = section['title_lower']
    return any(keep in title_lower for keep in KEEP_SECTIONS)

def extract_frontmatter_fields(fm_text):
    """Extract key fields from frontmatter text.

    Best-effort: returns an empty dict if PyYAML is unavailable or the
    frontmatter fails to parse (the broad except is deliberate -- a bad
    frontmatter should not abort the whole extraction run).
    """
    fields = {}
    try:
        import yaml  # imported lazily so the script works without PyYAML
        fm = yaml.safe_load(fm_text) or {}
        fields['title'] = fm.get('title', '')
        fields['name'] = fm.get('name', '')
        fields['type'] = fm.get('type', 'skill')
        fields['component_type'] = fm.get('component_type', 'skill')
        fields['version'] = fm.get('version', '1.0.0')
        fields['description'] = fm.get('description', '')
    except Exception:
        pass
    return fields

def generate_resource_frontmatter(section_title, parent_fields):
    """Generate YAML frontmatter for an extracted resource file.

    Args:
        section_title: Title of the extracted section (used for title/summary).
        parent_fields: Frontmatter fields of the parent SKILL.md; 'version'
            and 'name' are inherited when present.

    Returns:
        The frontmatter block as a string, ending with a newline.
    """
    lines = [
        "---",
        f"title: {section_title}",
        "type: skill",
        "component_type: skill",
        f"version: {parent_fields.get('version', '1.0.0')}",
        "audience: contributor",
        "status: active",
        f"summary: {section_title} reference for {parent_fields.get('name', 'this skill')}",
        "---",
    ]
    return '\n'.join(lines) + '\n'

def generate_reference_line(section_title, resource_name):
    """Generate the reference line that replaces an extracted section's body.

    (The original also computed an unused 'anchor' slug; that dead code
    has been removed -- the returned text is unchanged.)
    """
    return f"\nSee {resource_name} for detailed {section_title.lower()} content.\n\n"

def fix_skill_resources(skill_dir, dry_run=False, verbose=False, target_lines=480):
    """Extract large sections from a skill's SKILL.md into resource files.

    Addresses two QA checks: B3 (skills >300 body lines need extra .md
    resource files) and B1 (body must stay <=500 lines). The largest
    non-core H2 sections are moved to sibling .md files and replaced in
    SKILL.md with a one-line reference under the kept heading.

    Args:
        skill_dir: Path to the skill directory containing SKILL.md.
        dry_run: When True, compute changes but write nothing to disk.
        verbose: Accepted for CLI symmetry; not used inside this function.
        target_lines: Stop extracting once the body is at/below this count.

    Returns:
        A summary dict ('name', 'changes', 'old_lines', 'new_lines',
        'files_created', 'dry_run'), or None when the skill needs no
        change or cannot be processed (missing file, no frontmatter,
        no H2 sections, nothing extractable).
    """
    skill_name = skill_dir.name
    skill_file = skill_dir / "SKILL.md"

    if not skill_file.exists():
        return None

    with open(skill_file, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    fm_match = FRONTMATTER_RE.match(content)
    if not fm_match:
        # No YAML frontmatter -> not a well-formed skill file; skip it.
        return None

    fm_text = fm_match.group(2)
    body = content[fm_match.end():]
    line_count = len(body.split('\n'))

    # Check if the skill already ships extra .md resource files.
    has_extra = any(
        f.suffix == '.md' and f.name != 'SKILL.md'
        for f in skill_dir.iterdir() if f.is_file()
    )

    needs_b3 = line_count > 300 and not has_extra
    needs_b1 = line_count > 500

    if not needs_b3 and not needs_b1:
        return None

    # Parse sections
    sections = parse_h2_sections(body)
    if not sections:
        return None

    # Find extractable sections (not in KEEP list), sorted by size desc
    extractable = [s for s in sections if not should_keep(s)]
    extractable.sort(key=lambda s: s['lines'], reverse=True)

    if not extractable:
        # Every section matched KEEP_SECTIONS: fall back to extracting the
        # single largest section that is not absolutely core structure.
        all_sorted = sorted(sections, key=lambda s: s['lines'], reverse=True)
        for s in all_sorted:
            if s['title_lower'] not in ('table of contents', 'toc', 'steps', 'integration', 'principles'):
                extractable = [s]
                break

    if not extractable:
        return None

    # Determine how many sections to extract
    parent_fields = extract_frontmatter_fields(fm_text)
    existing_names = {'SKILL.md'}
    changes = []
    extractions = []  # dicts: section, resource_name, resource_content
    remaining_lines = line_count

    for section in extractable:
        if remaining_lines <= target_lines and has_extra:
            break  # Already under target and has resources

        if section['lines'] < 15:
            continue  # Too small to bother extracting

        resource_name = get_resource_name(section['title'], existing_names)
        existing_names.add(resource_name)

        # Build resource file content
        resource_fm = generate_resource_frontmatter(section['title'], parent_fields)
        resource_content = resource_fm + section['content']

        extractions.append({
            'section': section,
            'resource_name': resource_name,
            'resource_content': resource_content,
        })

        remaining_lines -= section['lines']
        has_extra = True
        changes.append(f"Extracted '{section['title']}' ({section['lines']} lines) -> {resource_name}")

        if remaining_lines <= target_lines:
            break

    if not extractions:
        return None

    # Build new body by replacing extracted sections with references.
    # Work backwards so earlier sections' character offsets stay valid.
    new_body = body
    for ext in sorted(extractions, key=lambda e: e['section']['start'], reverse=True):
        section = ext['section']
        ref_line = generate_reference_line(section['title'], ext['resource_name'])
        # Keep the H2 heading, replace content with reference
        heading_end = new_body.index('\n', section['start']) + 1
        new_body = new_body[:heading_end] + ref_line + new_body[section['end']:]

    new_line_count = len(new_body.split('\n'))

    if not dry_run:
        # Write resource files
        for ext in extractions:
            resource_path = skill_dir / ext['resource_name']
            with open(resource_path, 'w', encoding='utf-8') as f:
                f.write(ext['resource_content'])

        # Write updated SKILL.md (frontmatter untouched, body rewritten)
        new_content = content[:fm_match.end()] + new_body
        with open(skill_file, 'w', encoding='utf-8') as f:
            f.write(new_content)

    return {
        'name': skill_name,
        'changes': changes,
        'old_lines': line_count,
        'new_lines': new_line_count,
        'files_created': len(extractions),
        'dry_run': dry_run,
    }

def main():
    """CLI entry point: scan all skills and extract oversized sections.

    Prints a per-skill report (in --verbose or --dry-run mode) and a
    summary of how many skills were changed and files created.
    """
    parser = argparse.ArgumentParser(
        description='Extract large sections from skills into resource files (B3/B1 fix)')
    parser.add_argument('--path', default=str(SKILLS_DIR),
                        help='Skills directory (default: skills/)')
    parser.add_argument('--dry-run', '-n', action='store_true',
                        help='Preview changes without writing')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show details for each skill')
    parser.add_argument('--target-lines', type=int, default=480,
                        help='Target max lines for SKILL.md body (default: 480)')
    args = parser.parse_args()

    target = Path(args.path)
    # Only directories that actually contain a SKILL.md are candidates.
    skill_dirs = sorted([
        d for d in target.iterdir()
        if d.is_dir() and (d / 'SKILL.md').exists()
    ])

    fixed = 0
    skipped = 0
    total = len(skill_dirs)
    files_created = 0

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Scanning {total} skills for resource extraction...")
    print(f"{'=' * 60}")

    for d in skill_dirs:
        result = fix_skill_resources(d, dry_run=args.dry_run, verbose=args.verbose,
                                     target_lines=args.target_lines)
        if result is None:
            skipped += 1
            continue

        fixed += 1
        files_created += result['files_created']
        if args.verbose or args.dry_run:
            prefix = "[DRY] " if args.dry_run else "FIXED"
            print(f" {prefix} {result['name']} ({result['old_lines']} -> {result['new_lines']} lines):")
            for change in result['changes']:
                print(f" {change}")

    print(f"\n{'=' * 60}")
    print(f"{'[DRY RUN] ' if args.dry_run else ''}Results:")
    print(f" Total skills: {total}")
    print(f" Extracted: {fixed}")
    print(f" Already OK: {skipped}")
    print(f" Resource files: {files_created}")

    if args.dry_run and fixed > 0:
        print(f"\nRun without --dry-run to apply {fixed} extractions ({files_created} new files).")

# Bug fix: the mangled source had `if name == 'main'`, which raises
# NameError -- the correct script-entry guard uses the dunder names.
if __name__ == '__main__':
    main()