Skip to main content

Agent Skills Framework Extension

Uncertainty Quantification Patterns Skill

When to Use This Skill

Use this skill when implementing uncertainty quantification patterns in your codebase.

How to Use This Skill

  1. Review the patterns and examples below
  2. Apply the relevant patterns to your implementation
  3. Follow the best practices outlined in this skill

Confidence scoring, uncertainty frameworks, mixture-of-experts judges, and probabilistic reasoning.

Core Capabilities

  1. Confidence Scoring - Quantify certainty levels
  2. Uncertainty Quantification - Measure decision quality
  3. Mixture of Experts - Multi-model consensus
  4. Bayesian Reasoning - Probabilistic inference
  5. Error Bounds - Estimate accuracy ranges
  6. Decision Quality - Assess recommendation strength

Confidence Scoring Framework

# scripts/confidence-scoring.py
import re

from dataclasses import dataclass
from enum import Enum
from typing import List, Dict, Optional

class ConfidenceLevel(Enum):
    """Discrete confidence bands; each member's value is a [low, high) score interval."""
    VERY_HIGH = (0.9, 1.0)
    HIGH = (0.7, 0.9)
    MEDIUM = (0.5, 0.7)
    LOW = (0.3, 0.5)
    VERY_LOW = (0.0, 0.3)

@dataclass
class ConfidenceScore:
    """Result of a confidence assessment for a single decision."""
    score: float  # 0.0 - 1.0
    level: ConfidenceLevel  # band the score falls into
    factors: Dict[str, float]  # per-factor sub-scores, each 0.0 - 1.0
    explanation: str  # one-line summary naming the weakest factor

class ConfidenceScorer:
"""Score confidence in decisions and predictions"""

def score(
self,
decision: str,
evidence: List[str],
context: Dict
) -> ConfidenceScore:
"""Calculate confidence score"""
factors = self._calculate_factors(decision, evidence, context)
score = self._aggregate_score(factors)
level = self._determine_level(score)
explanation = self._explain_score(factors, score)

return ConfidenceScore(
score=score,
level=level,
factors=factors,
explanation=explanation
)

def _calculate_factors(
self,
decision: str,
evidence: List[str],
context: Dict
) -> Dict[str, float]:
"""Calculate individual confidence factors"""
return {
'evidence_quality': self._score_evidence_quality(evidence),
'evidence_quantity': self._score_evidence_quantity(evidence),
'consistency': self._score_consistency(evidence),
'novelty': self._score_novelty(decision, context),
'clarity': self._score_clarity(decision),
}

def _score_evidence_quality(self, evidence: List[str]) -> float:
"""Score quality of evidence"""
if not evidence:
return 0.0

quality_indicators = ['research', 'documented', 'proven', 'validated']
quality_count = sum(
1 for e in evidence
if any(ind in e.lower() for ind in quality_indicators)
)

return min(quality_count / len(evidence), 1.0)

def _score_evidence_quantity(self, evidence: List[str]) -> float:
"""Score quantity of evidence"""
# More evidence = higher confidence, up to a point
count = len(evidence)
if count == 0:
return 0.0
elif count <= 2:
return 0.3
elif count <= 5:
return 0.7
else:
return 1.0

def _score_consistency(self, evidence: List[str]) -> float:
"""Score consistency of evidence"""
if len(evidence) <= 1:
return 0.5 # Neutral for single piece

# Check for contradictory keywords
contradictory_pairs = [
('yes', 'no'),
('true', 'false'),
('recommend', 'avoid'),
]

contradictions = 0
evidence_lower = [e.lower() for e in evidence]

for word1, word2 in contradictory_pairs:
has_word1 = any(word1 in e for e in evidence_lower)
has_word2 = any(word2 in e for e in evidence_lower)
if has_word1 and has_word2:
contradictions += 1

# High contradictions = low consistency
if contradictions == 0:
return 1.0
elif contradictions == 1:
return 0.6
else:
return 0.3

def _score_novelty(self, decision: str, context: Dict) -> float:
"""Score how novel/familiar the decision is"""
is_novel = context.get('is_novel', False)
return 0.3 if is_novel else 0.8

def _score_clarity(self, decision: str) -> float:
"""Score clarity of decision"""
# Simple heuristic: longer, more specific = clearer
word_count = len(decision.split())
if word_count < 5:
return 0.4
elif word_count < 20:
return 0.8
else:
return 0.6 # Too verbose can reduce clarity

def _aggregate_score(self, factors: Dict[str, float]) -> float:
"""Aggregate factor scores"""
weights = {
'evidence_quality': 0.3,
'evidence_quantity': 0.2,
'consistency': 0.25,
'novelty': 0.15,
'clarity': 0.1,
}

weighted_sum = sum(
factors.get(factor, 0.0) * weight
for factor, weight in weights.items()
)

return min(weighted_sum, 1.0)

def _determine_level(self, score: float) -> ConfidenceLevel:
"""Determine confidence level from score"""
for level in ConfidenceLevel:
low, high = level.value
if low <= score < high:
return level
return ConfidenceLevel.VERY_HIGH

def _explain_score(self, factors: Dict[str, float], score: float) -> str:
"""Generate explanation for score"""
# Find lowest factor
lowest_factor = min(factors.items(), key=lambda x: x[1])

explanation = f"Confidence: {score:.2f} "

if score >= 0.7:
explanation += "(High confidence). "
elif score >= 0.5:
explanation += "(Medium confidence). "
else:
explanation += "(Low confidence). "

explanation += f"Lowest factor: {lowest_factor[0]} ({lowest_factor[1]:.2f})"

return explanation

# Usage
scorer = ConfidenceScorer()

supporting_evidence = [
    "PostgreSQL is well-documented and proven at scale",
    "Strong ACID compliance for data integrity",
    "Extensive ecosystem and community support",
]
confidence = scorer.score(
    decision="Use PostgreSQL for database",
    evidence=supporting_evidence,
    context={'is_novel': False},
)

print(f"Score: {confidence.score:.2f}")
print(f"Level: {confidence.level.name}")
print(f"Explanation: {confidence.explanation}")

Mixture of Experts Judge

# scripts/mixture-of-experts.py
from dataclasses import dataclass
from typing import List, Dict, Callable
import statistics

@dataclass
class ExpertOpinion:
    """A single expert's answer to a question."""
    expert_id: str  # identifier the expert was registered under
    prediction: str  # the expert's answer (used as its vote)
    confidence: float  # self-reported confidence, used as the vote weight
    reasoning: str  # free-text justification (may be empty)

@dataclass
class ConsensusResult:
    """Aggregated outcome of consulting every registered expert."""
    final_decision: str  # prediction with the largest confidence-weighted vote
    confidence: float  # mean expert confidence scaled by agreement
    agreement_level: float  # winning vote weight / total vote weight
    expert_opinions: List[ExpertOpinion]  # every opinion gathered
    explanation: str  # human-readable consensus summary

class MixtureOfExpertsJudge:
    """Aggregate opinions from multiple expert models."""

    def __init__(self):
        # expert_id -> callable(question, context) returning a dict with
        # 'prediction', 'confidence', and optionally 'reasoning'.
        self.experts: Dict[str, Callable] = {}

    def register_expert(self, expert_id: str, expert_fn: Callable):
        """Register an expert function under *expert_id*."""
        self.experts[expert_id] = expert_fn

    def consult(self, question: str, context: Dict) -> ConsensusResult:
        """Consult all experts and reach a confidence-weighted consensus."""
        opinions = self._gather_opinions(question, context)
        decision, agreement = self._reach_consensus(opinions)
        return ConsensusResult(
            final_decision=decision,
            confidence=self._calculate_confidence(opinions, agreement),
            agreement_level=agreement,
            expert_opinions=opinions,
            explanation=self._explain_consensus(opinions, decision, agreement),
        )

    def _gather_opinions(
        self,
        question: str,
        context: Dict
    ) -> List[ExpertOpinion]:
        """Ask every registered expert; report and skip any that raise."""
        collected: List[ExpertOpinion] = []
        for expert_id, expert_fn in self.experts.items():
            try:
                answer = expert_fn(question, context)
                collected.append(ExpertOpinion(
                    expert_id=expert_id,
                    prediction=answer['prediction'],
                    confidence=answer['confidence'],
                    reasoning=answer.get('reasoning', ''),
                ))
            except Exception as e:
                # Best-effort: a failing expert is dropped, not fatal.
                print(f"Expert {expert_id} failed: {e}")
        return collected

    def _reach_consensus(
        self,
        opinions: List[ExpertOpinion]
    ) -> tuple[str, float]:
        """Pick the prediction with the largest confidence-weighted vote."""
        if not opinions:
            return "No consensus", 0.0

        # Each expert's vote counts as much as its reported confidence.
        votes: Dict[str, float] = {}
        for opinion in opinions:
            votes[opinion.prediction] = (
                votes.get(opinion.prediction, 0.0) + opinion.confidence
            )

        winner, weight = max(votes.items(), key=lambda item: item[1])
        total = sum(votes.values())
        return winner, (weight / total if total > 0 else 0.0)

    def _calculate_confidence(
        self,
        opinions: List[ExpertOpinion],
        agreement: float
    ) -> float:
        """Overall confidence: mean expert confidence scaled by agreement."""
        if not opinions:
            return 0.0
        return statistics.mean(o.confidence for o in opinions) * agreement

    def _explain_consensus(
        self,
        opinions: List[ExpertOpinion],
        decision: str,
        agreement: float
    ) -> str:
        """Explain how consensus was reached, including dissent counts."""
        supporters = [o for o in opinions if o.prediction == decision]
        dissenters = [o for o in opinions if o.prediction != decision]

        parts = [
            f"Consensus: {decision} (agreement: {agreement:.2f})\n",
            f"Supporting experts: {len(supporters)}/{len(opinions)}\n",
        ]
        if dissenters:
            parts.append(f"Dissenting opinions: {len(dissenters)}\n")
        return "".join(parts)

# Usage
judge = MixtureOfExpertsJudge()

# Register experts (simplified for example)
expert_specs = [
    ('expert1', 'Use PostgreSQL', 0.9, 'Strong ACID compliance'),
    ('expert2', 'Use PostgreSQL', 0.85, 'Excellent tooling'),
    ('expert3', 'Use MongoDB', 0.7, 'Better for flexible schema'),
]
for expert_id, prediction, conf, reasoning in expert_specs:
    # Default-argument binding pins each expert's values at definition time.
    judge.register_expert(
        expert_id,
        lambda q, c, p=prediction, cf=conf, r=reasoning: {
            'prediction': p,
            'confidence': cf,
            'reasoning': r,
        },
    )

result = judge.consult("What database should we use?", {})

print(f"Decision: {result.final_decision}")
print(f"Confidence: {result.confidence:.2f}")
print(f"Agreement: {result.agreement_level:.2f}")
print(result.explanation)

Bayesian Uncertainty Estimator

// scripts/bayesian-uncertainty.ts

/** One Bayesian belief revision: prior -> posterior given a piece of evidence. */
interface BayesianUpdate {
  prior: number;
  likelihood: number;
  posterior: number;
  evidence: string; // 'supports' | 'opposes'
}

class BayesianUncertaintyEstimator {
  /**
   * Update belief based on new evidence.
   *
   * @param priorProbability Current belief P(H), in [0, 1].
   * @param evidenceStrength How strongly the evidence bears on H, in [0, 1].
   * @param evidenceSupports Whether the evidence argues for H.
   */
  updateBelief(
    priorProbability: number,
    evidenceStrength: number,
    evidenceSupports: boolean
  ): BayesianUpdate {
    // P(H|E) = P(E|H) * P(H) / P(E) — simplified Bayesian update.
    // Opposing evidence is folded in as the complement of its strength.
    const likelihood = evidenceSupports
      ? evidenceStrength
      : 1 - evidenceStrength;

    return {
      prior: priorProbability,
      likelihood,
      posterior: this.bayesianUpdate(priorProbability, likelihood),
      evidence: evidenceSupports ? 'supports' : 'opposes'
    };
  }

  private bayesianUpdate(prior: number, likelihood: number): number {
    // Odds-form update: posterior = L*p / (L*p + (1-L)*(1-p)).
    const weightFor = likelihood * prior;
    const weightAgainst = (1 - likelihood) * (1 - prior);
    const total = weightFor + weightAgainst;

    // Degenerate case (total 0): fall back to the prior unchanged.
    return total > 0 ? weightFor / total : prior;
  }

  /**
   * Sequential updates with multiple evidence: fold each item into the
   * belief, feeding every posterior forward as the next prior.
   */
  sequentialUpdate(
    initialPrior: number,
    evidence: Array<{ strength: number; supports: boolean }>
  ): number {
    return evidence.reduce(
      (belief, item) =>
        this.updateBelief(belief, item.strength, item.supports).posterior,
      initialPrior
    );
  }

  /**
   * Estimate uncertainty bounds around an estimate, clamped to [0, 1].
   * Simplified interval: half the residual (1 - confidence) on each side.
   */
  estimateBounds(
    estimate: number,
    confidenceLevel: number = 0.95
  ): { lower: number; upper: number } {
    const margin = (1 - confidenceLevel) / 2;

    return {
      lower: Math.max(0, estimate - margin),
      upper: Math.min(1, estimate + margin)
    };
  }
}

// Usage
const estimator = new BayesianUncertaintyEstimator();

// Start with 50% prior belief
const initialBelief = 0.5;

// Update with evidence
const update1 = estimator.updateBelief(initialBelief, 0.8, true);
console.log(`After evidence 1: ${update1.posterior.toFixed(2)}`);

// Sequential updates
const evidenceTrail = [
  { strength: 0.8, supports: true },
  { strength: 0.7, supports: true },
  { strength: 0.6, supports: false }
];
const final = estimator.sequentialUpdate(0.5, evidenceTrail);

console.log(`Final belief: ${final.toFixed(2)}`);

// Estimate bounds
const bounds = estimator.estimateBounds(final, 0.95);
console.log(`Bounds: [${bounds.lower.toFixed(2)}, ${bounds.upper.toFixed(2)}]`);

Usage Examples

Confidence Scoring

Apply uncertainty-quantification-patterns skill to score confidence in technical recommendation

Mixture of Experts

Apply uncertainty-quantification-patterns skill to aggregate multiple model predictions with consensus

Bayesian Updates

Apply uncertainty-quantification-patterns skill to update beliefs based on sequential evidence

Integration Points

  • novelty-detection-patterns - Situation confidence
  • prompt-analysis-patterns - Request uncertainty
  • research-patterns - Source confidence

Success Output

When successful, this skill MUST output:

✅ SKILL COMPLETE: uncertainty-quantification-patterns

Completed:
- [x] Confidence score calculated (X.XX / 1.0)
- [x] Confidence level determined (VERY_HIGH|HIGH|MEDIUM|LOW|VERY_LOW)
- [x] Contributing factors scored (evidence quality, quantity, consistency, novelty, clarity)
- [x] Mixture of experts consulted (N experts, M in consensus)
- [x] Bayesian updates applied (prior → posterior with evidence)
- [x] Uncertainty bounds estimated (95% confidence interval)

Outputs:
- Confidence Score: X.XX (Level: HIGH)
- Factors:
- Evidence quality: X.XX
- Evidence quantity: X.XX
- Consistency: X.XX
- Novelty: X.XX
- Clarity: X.XX
- Expert Consensus: "Decision" (agreement: X.XX)
- Uncertainty Bounds: [X.XX, X.XX] (95% CI)

Explanation:
- Lowest factor: {factor_name} (score: X.XX)
- Recommendation strength: {HIGH|MEDIUM|LOW}

Completion Checklist

Before marking this skill as complete, verify:

  • Confidence score calculated using all 5 factors
  • Confidence level mapped correctly (score → VERY_HIGH/HIGH/MEDIUM/LOW/VERY_LOW)
  • Evidence quality assessed (research, documented, validated sources)
  • Evidence consistency checked (no contradictions detected)
  • Mixture of experts consulted (multiple models/perspectives)
  • Bayesian updates applied (prior beliefs updated with evidence)
  • Uncertainty bounds estimated (confidence interval calculated)
  • Explanation generated (lowest factor identified)
  • Decision quality assessed (recommendation strength determined)

Failure Indicators

This skill has FAILED if:

  • ❌ Confidence score calculated without evidence (no basis)
  • ❌ Confidence level doesn't match score (mapping error)
  • ❌ Contradictory evidence not detected (consistency check failed)
  • ❌ Single expert opinion presented as consensus
  • ❌ Bayesian update produced invalid probability (>1.0 or <0.0)
  • ❌ Uncertainty bounds wider than [0.0, 1.0] (invalid interval)
  • ❌ No explanation provided for low confidence
  • ❌ High confidence claimed with insufficient evidence

When NOT to Use

Do NOT use this skill when:

  • Simple factual lookups (no uncertainty to quantify)
  • Deterministic computations (confidence always 1.0)
  • Binary yes/no questions with clear answers
  • Retrieving known constants or definitions
  • Tasks requiring precision over probabilistic reasoning
  • Real-time decision systems (overhead too high)
  • Low-stakes decisions (overhead not justified)

Use alternative skills:

  • factual-grounding - For verifiable facts
  • logical-inference - For deductive reasoning
  • research-patterns - For source-based answers

Anti-Patterns (Avoid)

| Anti-Pattern | Problem | Solution |
| --- | --- | --- |
| High confidence with no evidence | Overconfidence, unreliable | Require minimum evidence threshold |
| Ignoring contradictory evidence | Biased confidence scoring | Always check consistency, lower score if contradictions |
| Single factor dominance | Skewed confidence assessment | Weight all 5 factors appropriately |
| No uncertainty bounds | False precision illusion | Always provide confidence interval |
| Expert consensus without dissent tracking | Groupthink risk | Track dissenting opinions explicitly |
| Bayesian update without prior | Invalid probability | Always start with reasonable prior (0.5 if unknown) |
| Claiming certainty | Overconfidence, trust erosion | Cap confidence at 0.95 for non-deterministic decisions |
| No explanation for low confidence | Unclear what's missing | Always identify lowest factor and explain |

Principles

This skill embodies:

  • #5 Eliminate Ambiguity - Explicit confidence levels, clear uncertainty bounds
  • #6 Clear, Understandable, Explainable - Factor breakdown, lowest factor explanation
  • #8 No Assumptions - Evidence-based confidence, not guesswork
  • Trust & Transparency - Show evidence quality, admit uncertainty
  • Probabilistic Reasoning - Bayesian updates, mixture of experts
  • Humility - Acknowledge limitations, provide uncertainty bounds

Full Standard: CODITECT-STANDARD-AUTOMATION.md


Version: 1.1.0 | Updated: 2026-01-04 | Author: CODITECT Team