scripts-categorizer
#!/usr/bin/env python3
"""
CODITECT Smart Categorizer

Multi-signal content categorization with confidence scoring.

Improvements over keyword-only matching:
- URL structure analysis (3 points)
- Title matching (2 points)
- Heading analysis (2 points)
- Content keyword density (1 point per matched keyword, max 3)
- ML-assisted classification (optional)

Author: CODITECT
Version: 1.0.0
"""
import re
from collections import Counter
from dataclasses import dataclass, field
from typing import Optional
from urllib.parse import urlparse
@dataclass
class CategoryResult:
    """Outcome of categorizing one piece of content.

    Attributes:
        category: Name of the selected category.
        confidence: Confidence score for the selection; values of 0.6 and
            above are reported as confident by ``is_confident``.
        signals: Mapping of signal name to the integer points it
            contributed. Defaults to a fresh empty dict per instance.
    """

    category: str
    confidence: float
    signals: dict[str, int] = field(default_factory=dict)

    @property
    def is_confident(self) -> bool:
        """Return True when ``confidence`` is at least 0.6."""
        threshold = 0.6
        return self.confidence >= threshold
@dataclass
class CategoryConfig:
    """Definition of a single category and the hints used to detect it.

    Attributes:
        name: Category identifier.
        keywords: Keywords associated with the category.
        url_patterns: URL fragments associated with the category.
            Defaults to a fresh empty list per instance.
        priority: Integer priority; defaults to 0.
    """

    # Required fields.
    name: str
    keywords: list[str]

    # Optional fields.
    url_patterns: list[str] = field(default_factory=list)
    priority: int = 0
class SmartCategorizer:
    """
    CODITECT Smart Categorizer

    Multi-signal scoring system:
    - URL match: 3 points
    - Title match: 2 points
    - Heading match: 2 points
    - Content keyword: 1 point per distinct keyword found (max 3)

    Minimum threshold: 2 points for categorization

    NOTE: all matching is plain case-insensitive substring matching, so a
    short keyword such as "api" also matches inside longer words.
    """

    DEFAULT_CATEGORIES = {
        "getting_started": CategoryConfig(
            name="getting_started",
            keywords=["getting started", "quickstart", "quick start", "installation",
                      "install", "setup", "introduction", "intro", "first steps",
                      "prerequisites", "requirements", "hello world"],
            url_patterns=["/getting-started", "/quickstart", "/install", "/intro",
                          "/start", "/setup"],
            priority=10
        ),
        "tutorials": CategoryConfig(
            name="tutorials",
            keywords=["tutorial", "guide", "walkthrough", "step by step",
                      "how to", "howto", "example", "learn", "building",
                      "creating", "making"],
            url_patterns=["/tutorial", "/guide", "/learn", "/howto", "/examples"],
            priority=8
        ),
        "api_reference": CategoryConfig(
            name="api_reference",
            keywords=["api", "reference", "methods", "functions", "class",
                      "interface", "module", "package", "endpoint", "parameters",
                      "returns", "arguments", "signature"],
            url_patterns=["/api", "/reference", "/ref", "/docs/api", "/sdk"],
            priority=7
        ),
        "concepts": CategoryConfig(
            name="concepts",
            keywords=["concept", "architecture", "overview", "understanding",
                      "fundamentals", "basics", "theory", "how it works",
                      "design", "principles", "philosophy"],
            url_patterns=["/concepts", "/architecture", "/overview", "/fundamentals"],
            priority=6
        ),
        "advanced": CategoryConfig(
            name="advanced",
            keywords=["advanced", "optimization", "performance", "best practice",
                      "patterns", "tips", "tricks", "deep dive", "internals",
                      "scaling", "production"],
            url_patterns=["/advanced", "/performance", "/best-practices", "/patterns"],
            priority=5
        ),
        "configuration": CategoryConfig(
            name="configuration",
            keywords=["configuration", "config", "settings", "options",
                      "environment", "variables", "customize", "parameters"],
            url_patterns=["/config", "/configuration", "/settings", "/options"],
            priority=4
        ),
        "troubleshooting": CategoryConfig(
            name="troubleshooting",
            keywords=["troubleshooting", "faq", "debug", "debugging", "error",
                      "issue", "problem", "fix", "solution", "help",
                      "common issues", "known issues"],
            url_patterns=["/faq", "/troubleshooting", "/debug", "/help", "/issues"],
            priority=3
        ),
        "deployment": CategoryConfig(
            name="deployment",
            keywords=["deployment", "deploy", "production", "hosting",
                      "docker", "kubernetes", "aws", "cloud", "ci/cd",
                      "release", "publish"],
            url_patterns=["/deployment", "/deploy", "/hosting", "/production"],
            priority=4
        ),
        "testing": CategoryConfig(
            name="testing",
            keywords=["testing", "test", "unit test", "integration",
                      "e2e", "mock", "fixture", "coverage", "tdd"],
            url_patterns=["/testing", "/test", "/tests"],
            priority=4
        ),
        "migration": CategoryConfig(
            name="migration",
            keywords=["migration", "upgrade", "migrate", "updating",
                      "changelog", "breaking changes", "version"],
            url_patterns=["/migration", "/upgrade", "/changelog"],
            priority=2
        )
    }

    SIGNAL_WEIGHTS = {
        "url": 3,
        "title": 2,
        "heading": 2,
        "content": 1  # per distinct keyword found, capped at MAX_CONTENT_HITS
    }

    MIN_THRESHOLD = 2

    # Upper bound on how many distinct content keywords earn points.
    MAX_CONTENT_HITS = 3

    def __init__(
        self,
        custom_categories: Optional[dict[str, CategoryConfig]] = None,
        min_threshold: int = MIN_THRESHOLD
    ):
        """
        Build a categorizer from the default categories plus any overrides.

        Args:
            custom_categories: Extra categories; an entry whose key equals a
                default category name replaces that default.
            min_threshold: Minimum total score a category needs to be
                considered at all.
        """
        # Shallow copy so per-instance additions never touch the class dict.
        self.categories = {**self.DEFAULT_CATEGORIES}
        if custom_categories:
            self.categories.update(custom_categories)
        self.min_threshold = min_threshold

    def _score_category(
        self,
        cat_config: CategoryConfig,
        url_lower: str,
        title_lower: str,
        headings_text: str,
        content_lower: str
    ) -> tuple[int, dict[str, int]]:
        """Score one category against already-lowercased page text."""
        weights = self.SIGNAL_WEIGHTS
        signals: dict[str, int] = {}

        # Signal 1: any URL pattern appears in the URL.
        if any(pattern in url_lower for pattern in cat_config.url_patterns):
            signals["url"] = weights["url"]

        # Signal 2: any keyword appears in the title.
        if any(kw in title_lower for kw in cat_config.keywords):
            signals["title"] = weights["title"]

        # Signal 3: any keyword appears in the joined headings.
        if any(kw in headings_text for kw in cat_config.keywords):
            signals["heading"] = weights["heading"]

        # Signal 4: number of distinct keywords present in the content
        # (not the number of occurrences), capped.
        keyword_hits = sum(1 for kw in cat_config.keywords if kw in content_lower)
        if keyword_hits > 0:
            signals["content"] = (
                min(keyword_hits, self.MAX_CONTENT_HITS) * weights["content"]
            )

        return sum(signals.values()), signals

    def categorize(
        self,
        url: str,
        title: str,
        headings: list[str],
        content: str
    ) -> CategoryResult:
        """
        Categorize content using multi-signal scoring.

        Args:
            url: Page URL
            title: Page title
            headings: List of headings (H1, H2, H3)
            content: Page text content

        Any of the arguments may be None; it is treated as empty.

        Returns:
            CategoryResult with category, confidence, and signal breakdown.
            When no category reaches the minimum threshold the category is
            "other" with confidence 0.0 and no signals.
        """
        # Normalize once up front: these values do not depend on the
        # category, so there is no reason to recompute them per category.
        url_lower = (url or "").lower()
        title_lower = (title or "").lower()
        headings_text = " ".join(
            h.lower() for h in (headings or []) if h is not None
        )
        content_lower = (content or "").lower()

        scores = {}
        for cat_name, cat_config in self.categories.items():
            total_score, signals = self._score_category(
                cat_config, url_lower, title_lower, headings_text, content_lower
            )
            if total_score >= self.min_threshold:
                scores[cat_name] = {
                    "score": total_score,
                    "signals": signals,
                    "priority": cat_config.priority
                }

        if not scores:
            return CategoryResult(
                category="other",
                confidence=0.0,
                signals={}
            )

        # Select best category (highest score, then priority); remaining
        # ties go to the category that appears first in self.categories.
        best_category = max(
            scores,
            key=lambda k: (scores[k]["score"], scores[k]["priority"])
        )
        best_score = scores[best_category]["score"]

        # Confidence is the share of the maximum achievable score
        # (10 with the default weights), clamped to 1.0.
        weights = self.SIGNAL_WEIGHTS
        max_score = (
            weights["url"]
            + weights["title"]
            + weights["heading"]
            + self.MAX_CONTENT_HITS * weights["content"]
        )
        confidence = min(best_score / max_score, 1.0) if max_score > 0 else 0.0

        return CategoryResult(
            category=best_category,
            confidence=confidence,
            signals=scores[best_category]["signals"]
        )

    def categorize_batch(
        self,
        pages: list[dict]
    ) -> dict[str, list[dict]]:
        """
        Categorize multiple pages and group by category.

        Each page dict is annotated in place with "category",
        "category_confidence" and "category_signals".

        Args:
            pages: List of page dicts with url, title, headings, content.
                Missing keys are treated as empty.

        Returns:
            Dict mapping every known category name (plus "other") to the
            list of pages assigned to it; unused categories map to [].
        """
        categorized: dict[str, list[dict]] = {cat: [] for cat in self.categories}
        categorized["other"] = []

        for page in pages:
            result = self.categorize(
                url=page.get("url", ""),
                title=page.get("title", ""),
                headings=page.get("headings", []),
                content=page.get("content", "")
            )
            page["category"] = result.category
            page["category_confidence"] = result.confidence
            page["category_signals"] = result.signals
            categorized[result.category].append(page)

        return categorized

    def infer_categories_from_urls(self, urls: list[str]) -> dict[str, list[str]]:
        """
        Auto-infer category structure from URL patterns.

        Useful for sites without predefined categories. URLs are grouped by
        their first non-empty path segment; of the 10 most common segments,
        those with at least 3 URLs become categories.

        Args:
            urls: URLs to analyze.

        Returns:
            Dict mapping each inferred segment to the URLs whose first path
            segment is that segment, in input order.
        """
        # Group by the parsed first path segment. Matching "/segment" as a
        # substring of the whole URL would also catch the host name
        # ("https://docs.example.com") and longer segments ("/docs-old").
        by_segment: dict[str, list[str]] = {}
        for url in urls:
            path_parts = [p for p in urlparse(url).path.split("/") if p]
            if path_parts:
                by_segment.setdefault(path_parts[0], []).append(url)

        segment_counts = Counter(
            {segment: len(group) for segment, group in by_segment.items()}
        )

        # Top segments become categories, subject to a minimum page count.
        return {
            segment: by_segment[segment]
            for segment, count in segment_counts.most_common(10)
            if count >= 3
        }

    def get_category_stats(self, categorized: dict[str, list]) -> dict:
        """
        Get statistics about categorization results.

        Args:
            categorized: Dict mapping category names to lists of page dicts.

        Returns:
            Dict with "total_pages", "categories_used" (non-empty
            categories), "distribution" (count and percentage per non-empty
            category) and "avg_confidence" (mean of each page's
            "category_confidence", counting a missing value as 0).
        """
        total = sum(len(pages) for pages in categorized.values())
        stats = {
            "total_pages": total,
            "categories_used": sum(1 for pages in categorized.values() if pages),
            "distribution": {},
            "avg_confidence": 0.0
        }

        confidences = []
        for cat, pages in categorized.items():
            if not pages:
                continue
            stats["distribution"][cat] = {
                "count": len(pages),
                "percentage": (len(pages) / total * 100) if total > 0 else 0
            }
            confidences.extend(p.get("category_confidence", 0) for p in pages)

        if confidences:
            stats["avg_confidence"] = sum(confidences) / len(confidences)

        return stats