Agent Skills Framework Extension
Uncertainty Quantification Patterns Skill
When to Use This Skill
Use this skill when implementing uncertainty quantification patterns in your codebase.
How to Use This Skill
- Review the patterns and examples below
- Apply the relevant patterns to your implementation
- Follow the best practices outlined in this skill
Confidence scoring, uncertainty frameworks, mixture-of-experts judges, and probabilistic reasoning.
Core Capabilities
- Confidence Scoring - Quantify certainty levels
- Uncertainty Quantification - Measure decision quality
- Mixture of Experts - Multi-model consensus
- Bayesian Reasoning - Probabilistic inference
- Error Bounds - Estimate accuracy ranges
- Decision Quality - Assess recommendation strength
Confidence Scoring Framework
# scripts/confidence-scoring.py
from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum
class ConfidenceLevel(Enum):
    """Qualitative confidence bands; each value is a [low, high) score range."""

    VERY_HIGH = (0.9, 1.0)
    HIGH = (0.7, 0.9)
    MEDIUM = (0.5, 0.7)
    LOW = (0.3, 0.5)
    VERY_LOW = (0.0, 0.3)
@dataclass
class ConfidenceScore:
    """A scored confidence assessment together with its supporting breakdown."""

    score: float  # Aggregate confidence in [0.0, 1.0]
    level: ConfidenceLevel  # Qualitative band the score falls into
    factors: Dict[str, float]  # Per-factor scores that produced the aggregate
    explanation: str  # Human-readable summary of the result
class ConfidenceScorer:
    """Score confidence in decisions and predictions.

    The overall score is a weighted aggregate of five factors computed from
    the decision text, its supporting evidence, and context hints.
    """

    def score(
        self,
        decision: str,
        evidence: List[str],
        context: Dict
    ) -> ConfidenceScore:
        """Calculate a confidence score for *decision*.

        Args:
            decision: The recommendation or prediction being assessed.
            evidence: Free-text statements supporting the decision.
            context: Extra hints; only ``is_novel`` (bool) is read here.

        Returns:
            A ConfidenceScore bundling the aggregate score, its qualitative
            level, the per-factor breakdown, and a short explanation.
        """
        factors = self._calculate_factors(decision, evidence, context)
        score = self._aggregate_score(factors)
        level = self._determine_level(score)
        explanation = self._explain_score(factors, score)
        return ConfidenceScore(
            score=score,
            level=level,
            factors=factors,
            explanation=explanation
        )

    def _calculate_factors(
        self,
        decision: str,
        evidence: List[str],
        context: Dict
    ) -> Dict[str, float]:
        """Calculate the individual confidence factors (each in [0.0, 1.0])."""
        return {
            'evidence_quality': self._score_evidence_quality(evidence),
            'evidence_quantity': self._score_evidence_quantity(evidence),
            'consistency': self._score_consistency(evidence),
            'novelty': self._score_novelty(decision, context),
            'clarity': self._score_clarity(decision),
        }

    def _score_evidence_quality(self, evidence: List[str]) -> float:
        """Fraction of evidence items that mention a quality keyword."""
        if not evidence:
            return 0.0
        quality_indicators = ['research', 'documented', 'proven', 'validated']
        quality_count = sum(
            1 for e in evidence
            if any(ind in e.lower() for ind in quality_indicators)
        )
        return min(quality_count / len(evidence), 1.0)

    def _score_evidence_quantity(self, evidence: List[str]) -> float:
        """Score quantity of evidence: more is better, with diminishing returns."""
        count = len(evidence)
        if count == 0:
            return 0.0
        elif count <= 2:
            return 0.3
        elif count <= 5:
            return 0.7
        else:
            return 1.0

    def _score_consistency(self, evidence: List[str]) -> float:
        """Score consistency of evidence (1.0 = no contradictions detected)."""
        if len(evidence) <= 1:
            return 0.5  # Neutral: a single item cannot contradict itself
        contradictory_pairs = [
            ('yes', 'no'),
            ('true', 'false'),
            ('recommend', 'avoid'),
        ]
        # BUGFIX: match whole words, not substrings. A substring test flags
        # false contradictions ('no' inside 'novel', 'cannot', 'note', ...).
        word_sets = [set(e.lower().split()) for e in evidence]
        contradictions = 0
        for word1, word2 in contradictory_pairs:
            has_word1 = any(word1 in words for words in word_sets)
            has_word2 = any(word2 in words for words in word_sets)
            if has_word1 and has_word2:
                contradictions += 1
        # More contradictions -> lower consistency
        if contradictions == 0:
            return 1.0
        elif contradictions == 1:
            return 0.6
        else:
            return 0.3

    def _score_novelty(self, decision: str, context: Dict) -> float:
        """Familiar decisions earn more confidence than novel ones."""
        is_novel = context.get('is_novel', False)
        return 0.3 if is_novel else 0.8

    def _score_clarity(self, decision: str) -> float:
        """Heuristic clarity score from decision length (specific but concise)."""
        word_count = len(decision.split())
        if word_count < 5:
            return 0.4
        elif word_count < 20:
            return 0.8
        else:
            return 0.6  # Overly verbose decisions lose clarity

    def _aggregate_score(self, factors: Dict[str, float]) -> float:
        """Combine factor scores with fixed weights (weights sum to 1.0)."""
        weights = {
            'evidence_quality': 0.3,
            'evidence_quantity': 0.2,
            'consistency': 0.25,
            'novelty': 0.15,
            'clarity': 0.1,
        }
        weighted_sum = sum(
            factors.get(factor, 0.0) * weight
            for factor, weight in weights.items()
        )
        return min(weighted_sum, 1.0)

    def _determine_level(self, score: float) -> ConfidenceLevel:
        """Map a numeric score onto its [low, high) confidence band."""
        for level in ConfidenceLevel:
            low, high = level.value
            if low <= score < high:
                return level
        # A perfect 1.0 falls outside every half-open band; treat as VERY_HIGH.
        return ConfidenceLevel.VERY_HIGH

    def _explain_score(self, factors: Dict[str, float], score: float) -> str:
        """Summarize the score and call out the weakest contributing factor."""
        lowest_factor = min(factors.items(), key=lambda x: x[1])
        explanation = f"Confidence: {score:.2f} "
        if score >= 0.7:
            explanation += "(High confidence). "
        elif score >= 0.5:
            explanation += "(Medium confidence). "
        else:
            explanation += "(Low confidence). "
        explanation += f"Lowest factor: {lowest_factor[0]} ({lowest_factor[1]:.2f})"
        return explanation
# Usage
scorer = ConfidenceScorer()
# Score a concrete recommendation against three evidence statements.
confidence = scorer.score(
    decision="Use PostgreSQL for database",
    evidence=[
        "PostgreSQL is well-documented and proven at scale",
        "Strong ACID compliance for data integrity",
        "Extensive ecosystem and community support"
    ],
    context={'is_novel': False}  # Familiar territory raises the novelty factor
)
print(f"Score: {confidence.score:.2f}")
print(f"Level: {confidence.level.name}")
print(f"Explanation: {confidence.explanation}")
Mixture of Experts Judge
# scripts/mixture-of-experts.py
from dataclasses import dataclass
from typing import List, Dict, Callable
import statistics
@dataclass
class ExpertOpinion:
    """One expert's answer to a consultation."""

    expert_id: str  # Identifier the expert was registered under
    prediction: str  # The expert's proposed answer
    confidence: float  # Self-reported confidence in [0.0, 1.0]
    reasoning: str  # Free-text justification (may be empty)
@dataclass
class ConsensusResult:
    """Outcome of consulting the full panel of experts."""

    final_decision: str  # Winning prediction after weighted voting
    confidence: float  # Overall confidence in that decision
    agreement_level: float  # Share of total vote weight behind the winner
    expert_opinions: List[ExpertOpinion]  # Every opinion that was gathered
    explanation: str  # Summary of how consensus was reached
class MixtureOfExpertsJudge:
    """Aggregate opinions from multiple expert models into one consensus."""

    def __init__(self):
        # Registered expert callables, keyed by their identifier.
        self.experts: Dict[str, Callable] = {}

    def register_expert(self, expert_id: str, expert_fn: Callable):
        """Register *expert_fn* under *expert_id* (replaces any existing entry)."""
        self.experts[expert_id] = expert_fn

    def consult(self, question: str, context: Dict) -> ConsensusResult:
        """Poll every registered expert and combine their answers."""
        gathered = self._gather_opinions(question, context)
        winner, agreement = self._reach_consensus(gathered)
        overall = self._calculate_confidence(gathered, agreement)
        summary = self._explain_consensus(gathered, winner, agreement)
        return ConsensusResult(
            final_decision=winner,
            confidence=overall,
            agreement_level=agreement,
            expert_opinions=gathered,
            explanation=summary,
        )

    def _gather_opinions(
        self,
        question: str,
        context: Dict
    ) -> List[ExpertOpinion]:
        """Invoke each expert in turn; a failing expert is reported and skipped."""
        collected: List[ExpertOpinion] = []
        for expert_id, expert_fn in self.experts.items():
            try:
                answer = expert_fn(question, context)
                collected.append(ExpertOpinion(
                    expert_id=expert_id,
                    prediction=answer['prediction'],
                    confidence=answer['confidence'],
                    reasoning=answer.get('reasoning', ''),
                ))
            except Exception as e:
                print(f"Expert {expert_id} failed: {e}")
        return collected

    def _reach_consensus(
        self,
        opinions: List[ExpertOpinion]
    ) -> tuple[str, float]:
        """Weighted vote: each prediction accumulates its backers' confidence."""
        if not opinions:
            return "No consensus", 0.0
        tally: Dict[str, float] = {}
        for opinion in opinions:
            tally[opinion.prediction] = tally.get(opinion.prediction, 0.0) + opinion.confidence
        if not tally:
            return "No consensus", 0.0
        winner, weight = max(tally.items(), key=lambda item: item[1])
        total = sum(tally.values())
        # Agreement is the winner's share of the total confidence mass.
        share = weight / total if total > 0 else 0.0
        return winner, share

    def _calculate_confidence(
        self,
        opinions: List[ExpertOpinion],
        agreement: float
    ) -> float:
        """Overall confidence = mean expert confidence scaled by agreement."""
        if not opinions:
            return 0.0
        return statistics.mean(o.confidence for o in opinions) * agreement

    def _explain_consensus(
        self,
        opinions: List[ExpertOpinion],
        decision: str,
        agreement: float
    ) -> str:
        """Describe how many experts backed (and opposed) the winning decision."""
        supporters = sum(1 for o in opinions if o.prediction == decision)
        dissenters = len(opinions) - supporters
        text = f"Consensus: {decision} (agreement: {agreement:.2f})\n"
        text += f"Supporting experts: {supporters}/{len(opinions)}\n"
        if dissenters:
            text += f"Dissenting opinions: {dissenters}\n"
        return text
# Usage
judge = MixtureOfExpertsJudge()
# Register experts (simplified for example)
# Two experts back PostgreSQL; one dissents in favour of MongoDB.
judge.register_expert('expert1', lambda q, c: {
    'prediction': 'Use PostgreSQL',
    'confidence': 0.9,
    'reasoning': 'Strong ACID compliance'
})
judge.register_expert('expert2', lambda q, c: {
    'prediction': 'Use PostgreSQL',
    'confidence': 0.85,
    'reasoning': 'Excellent tooling'
})
judge.register_expert('expert3', lambda q, c: {
    'prediction': 'Use MongoDB',
    'confidence': 0.7,
    'reasoning': 'Better for flexible schema'
})
result = judge.consult("What database should we use?", {})
print(f"Decision: {result.final_decision}")
print(f"Confidence: {result.confidence:.2f}")
print(f"Agreement: {result.agreement_level:.2f}")
print(result.explanation)
Bayesian Uncertainty Estimator
// scripts/bayesian-uncertainty.ts
// Record of one Bayesian belief revision: prior -> posterior given a piece of evidence.
interface BayesianUpdate {
  prior: number;       // P(H) before the evidence was considered
  likelihood: number;  // Effective likelihood used by the update rule
  posterior: number;   // P(H) after applying the update rule
  evidence: string;    // 'supports' or 'opposes'
}
class BayesianUncertaintyEstimator {
  /**
   * Revise a belief in light of one piece of evidence.
   *
   * @param priorProbability Belief P(H) before the evidence, in [0, 1].
   * @param evidenceStrength How strongly the evidence bears on H, in [0, 1].
   * @param evidenceSupports True if the evidence argues for H, false if against.
   */
  updateBelief(
    priorProbability: number,
    evidenceStrength: number,
    evidenceSupports: boolean
  ): BayesianUpdate {
    // Evidence against H is treated as evidence of strength (1 - s) for H.
    const likelihood = evidenceSupports ? evidenceStrength : (1 - evidenceStrength);
    const posterior = this.bayesianUpdate(priorProbability, likelihood);
    return {
      prior: priorProbability,
      likelihood,
      posterior,
      evidence: evidenceSupports ? 'supports' : 'opposes'
    };
  }

  // Odds-form Bayes rule: posterior = L*p / (L*p + (1-L)*(1-p)).
  // Falls back to the prior when the denominator is zero.
  private bayesianUpdate(prior: number, likelihood: number): number {
    const top = likelihood * prior;
    const bottom = top + (1 - likelihood) * (1 - prior);
    return bottom > 0 ? top / bottom : prior;
  }

  /** Fold a sequence of evidence into one belief, updating left to right. */
  sequentialUpdate(
    initialPrior: number,
    evidence: Array<{ strength: number; supports: boolean }>
  ): number {
    return evidence.reduce(
      (belief, item) => this.updateBelief(belief, item.strength, item.supports).posterior,
      initialPrior
    );
  }

  /**
   * Simplified symmetric interval around the estimate, clamped to [0, 1].
   * NOTE(review): this is a fixed-width margin, not a statistical CI.
   */
  estimateBounds(
    estimate: number,
    confidenceLevel: number = 0.95
  ): { lower: number; upper: number } {
    const halfWidth = (1 - confidenceLevel) / 2;
    return {
      lower: Math.max(0, estimate - halfWidth),
      upper: Math.min(1, estimate + halfWidth)
    };
  }
}
// Usage
const estimator = new BayesianUncertaintyEstimator();
// Start with 50% prior belief
let belief = 0.5;
// Update with evidence
const update1 = estimator.updateBelief(belief, 0.8, true);
console.log(`After evidence 1: ${update1.posterior.toFixed(2)}`);
// Sequential updates: two supporting observations, then one opposing one
const final = estimator.sequentialUpdate(0.5, [
  { strength: 0.8, supports: true },
  { strength: 0.7, supports: true },
  { strength: 0.6, supports: false }
]);
console.log(`Final belief: ${final.toFixed(2)}`);
// Estimate bounds (clamped symmetric margin around the final belief)
const bounds = estimator.estimateBounds(final, 0.95);
console.log(`Bounds: [${bounds.lower.toFixed(2)}, ${bounds.upper.toFixed(2)}]`);
Usage Examples
Confidence Scoring
Apply uncertainty-quantification-patterns skill to score confidence in technical recommendation
Mixture of Experts
Apply uncertainty-quantification-patterns skill to aggregate multiple model predictions with consensus
Bayesian Updates
Apply uncertainty-quantification-patterns skill to update beliefs based on sequential evidence
Integration Points
- novelty-detection-patterns - Situation confidence
- prompt-analysis-patterns - Request uncertainty
- research-patterns - Source confidence
Success Output
When successful, this skill MUST output:
✅ SKILL COMPLETE: uncertainty-quantification-patterns
Completed:
- [x] Confidence score calculated (X.XX / 1.0)
- [x] Confidence level determined (VERY_HIGH|HIGH|MEDIUM|LOW|VERY_LOW)
- [x] Contributing factors scored (evidence quality, quantity, consistency, novelty, clarity)
- [x] Mixture of experts consulted (N experts, M in agreement)
- [x] Bayesian updates applied (prior → posterior with evidence)
- [x] Uncertainty bounds estimated (95% confidence interval)
Outputs:
- Confidence Score: X.XX (Level: HIGH)
- Factors:
- Evidence quality: X.XX
- Evidence quantity: X.XX
- Consistency: X.XX
- Novelty: X.XX
- Clarity: X.XX
- Expert Consensus: "Decision" (agreement: X.XX)
- Uncertainty Bounds: [X.XX, X.XX] (95% CI)
Explanation:
- Lowest factor: {factor_name} (score: X.XX)
- Recommendation strength: {HIGH|MEDIUM|LOW}
Completion Checklist
Before marking this skill as complete, verify:
- Confidence score calculated using all 5 factors
- Confidence level mapped correctly (score → VERY_HIGH/HIGH/MEDIUM/LOW/VERY_LOW)
- Evidence quality assessed (research, documented, validated sources)
- Evidence consistency checked (no contradictions detected)
- Mixture of experts consulted (multiple models/perspectives)
- Bayesian updates applied (prior beliefs updated with evidence)
- Uncertainty bounds estimated (confidence interval calculated)
- Explanation generated (lowest factor identified)
- Decision quality assessed (recommendation strength determined)
Failure Indicators
This skill has FAILED if:
- ❌ Confidence score calculated without evidence (no basis)
- ❌ Confidence level doesn't match score (mapping error)
- ❌ Contradictory evidence not detected (consistency check failed)
- ❌ Single expert opinion presented as consensus
- ❌ Bayesian update produced invalid probability (>1.0 or <0.0)
- ❌ Uncertainty bounds wider than [0.0, 1.0] (invalid interval)
- ❌ No explanation provided for low confidence
- ❌ High confidence claimed with insufficient evidence
When NOT to Use
Do NOT use this skill when:
- Simple factual lookups (no uncertainty to quantify)
- Deterministic computations (confidence always 1.0)
- Binary yes/no questions with clear answers
- Retrieving known constants or definitions
- Tasks requiring precision over probabilistic reasoning
- Real-time decision systems (overhead too high)
- Low-stakes decisions (overhead not justified)
Use alternative skills:
- factual-grounding - For verifiable facts
- logical-inference - For deductive reasoning
- research-patterns - For source-based answers
Anti-Patterns (Avoid)
| Anti-Pattern | Problem | Solution |
|---|---|---|
| High confidence with no evidence | Overconfidence, unreliable | Require minimum evidence threshold |
| Ignoring contradictory evidence | Biased confidence scoring | Always check consistency, lower score if contradictions |
| Single factor dominance | Skewed confidence assessment | Weight all 5 factors appropriately |
| No uncertainty bounds | False precision illusion | Always provide confidence interval |
| Expert consensus without dissent tracking | Groupthink risk | Track dissenting opinions explicitly |
| Bayesian update without prior | Invalid probability | Always start with reasonable prior (0.5 if unknown) |
| Claiming certainty | Overconfidence, trust erosion | Cap confidence at 0.95 for non-deterministic decisions |
| No explanation for low confidence | Unclear what's missing | Always identify lowest factor and explain |
Principles
This skill embodies:
- #5 Eliminate Ambiguity - Explicit confidence levels, clear uncertainty bounds
- #6 Clear, Understandable, Explainable - Factor breakdown, lowest factor explanation
- #8 No Assumptions - Evidence-based confidence, not guesswork
- Trust & Transparency - Show evidence quality, admit uncertainty
- Probabilistic Reasoning - Bayesian updates, mixture of experts
- Humility - Acknowledge limitations, provide uncertainty bounds
Full Standard: CODITECT-STANDARD-AUTOMATION.md
Version: 1.1.0 | Updated: 2026-01-04 | Author: CODITECT Team