Theory Notebook
Converted from `theory.ipynb` for web reading.
Error Analysis and Ablations
Error analysis turns aggregate scores into failure structure; ablations test which component actually caused an improvement.
This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.
Code cell 2
# Plotting and reproducibility setup for the whole notebook.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's theme when it is installed; otherwise fall back to
# matplotlib's bundled re-implementation of the seaborn whitegrid style.
try:
    import seaborn as sns

    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True  # later cells can branch on seaborn availability
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Global figure defaults: readable font sizes, no top/right spines,
# and tight, higher-DPI saved figures.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Seed the legacy global NumPy RNG so every random demo below reproduces.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math  # NOTE(review): not used in any visible cell — confirm before removing

# Named hex colors shared by the plotting demos below.
# NOTE(review): these look like a colorblind-friendly palette (resembles
# Paul Tol's "bright" scheme) — confirm before documenting as such.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
def header(title):
    """Print *title* framed above and below by an 80-character rule."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(title)
    print(rule)
def check_true(condition, message):
    """Print a PASS/FAIL line for *condition*, then assert that it holds."""
    verdict = bool(condition)
    label = "PASS" if verdict else "FAIL"
    print(f"{label} - {message}")
    assert verdict
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Assert |actual - expected| <= tol, printing a PASS/FAIL verdict first."""
    gap = abs(actual - expected)
    within = gap <= tol
    print(f"{'PASS' if within else 'FAIL'} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert within
def bootstrap_mean_ci(values, B=1000, alpha=0.05, rng=None):
    """Percentile bootstrap confidence interval for the mean of *values*.

    Parameters
    ----------
    values : array-like of float
        Sample whose mean is estimated; must be non-empty.
    B : int
        Number of bootstrap resamples.
    alpha : float
        Two-sided miscoverage level; a (1 - alpha) percentile interval is
        returned.
    rng : numpy.random.Generator or None
        Optional generator for reproducible resampling. When None (the
        default) the legacy global ``np.random`` state is used, which
        preserves this notebook's original behavior exactly.

    Returns
    -------
    tuple of float
        ``(mean, lo, hi)`` — sample mean and the percentile interval.

    Raises
    ------
    ValueError
        If *values* is empty (previously this surfaced as an opaque
        ``np.random.randint`` error about ``low >= high``).
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        raise ValueError("bootstrap_mean_ci requires a non-empty sample")
    n = len(values)
    # Resample indices with replacement: one row per bootstrap replicate.
    if rng is None:
        idx = np.random.randint(0, n, size=(B, n))
    else:
        idx = rng.integers(0, n, size=(B, n))
    boot = values[idx].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha / 2, 1 - alpha / 2])
    return float(values.mean()), float(lo), float(hi)
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Accepts scalars or NumPy arrays. The previous form ``1/(1+np.exp(-x))``
    overflows (emitting a RuntimeWarning) for large negative x; evaluating
    ``exp(-|x|)`` keeps the exponent non-positive so ``exp`` never
    overflows, while producing identical values.
    """
    z = np.exp(-np.abs(x))
    # x >= 0: 1/(1+e^-x);  x < 0: e^x/(1+e^x) — algebraically equal forms.
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))


print("Evaluation helper functions loaded.")
Demo 1: Aggregate metrics hide failure modes
This cell studies Aggregate metrics hide failure modes through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 5
header("Demo 1 - Aggregate metrics hide failure modes: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
for j in range(3):
ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")
Demo 2: Failures as structured data
This cell studies Failures as structured data through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 7
header("Demo 2 - Failures as structured data: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 3: Ablations as causal probes
This cell studies Ablations as causal probes through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 9
header("Demo 3 - Ablations as causal probes: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")
Demo 4: Debugging without benchmark overfitting
This cell studies Debugging without benchmark overfitting through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 11
header("Demo 4 - Debugging without benchmark overfitting: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 5: From one bug to a regression suite
This cell studies From one bug to a regression suite through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 13
header("Demo 5 - From one bug to a regression suite: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 6: Error set
This cell studies Error set through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 15
header("Demo 6 - Error set: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 7: Confusion matrix
This cell studies Confusion matrix through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 17
header("Demo 7 - Confusion matrix: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 8: Slice and subgroup
This cell studies Slice and subgroup through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 19
header("Demo 8 - Slice and subgroup: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")
Demo 9: Counterfactual example
This cell studies Counterfactual example through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 21
header("Demo 9 - Counterfactual example: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
for j in range(3):
ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")
Demo 10: Ablation effect and interaction
This cell studies Ablation effect and interaction through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 23
header("Demo 10 - Ablation effect and interaction: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 11: False positives and false negatives
This cell studies False positives and false negatives through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 25
header("Demo 11 - False positives and false negatives: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")
Demo 12: Hallucination and unsupported claims
This cell studies Hallucination and unsupported claims through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 27
header("Demo 12 - Hallucination and unsupported claims: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 13: Format and instruction errors
This cell studies Format and instruction errors through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 29
header("Demo 13 - Format and instruction errors: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 14: Reasoning errors
This cell studies Reasoning errors through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 31
header("Demo 14 - Reasoning errors: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 15: Tool and retrieval failures
This cell studies Tool and retrieval failures through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 33
header("Demo 15 - Tool and retrieval failures: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 16: Stratified metrics
This cell studies Stratified metrics through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 35
header("Demo 16 - Stratified metrics: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")
Demo 17: Subgroup confidence intervals
This cell studies Subgroup confidence intervals through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 37
header("Demo 17 - Subgroup confidence intervals: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
for j in range(3):
ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")
Demo 18: Multiple-testing control for slices
This cell studies Multiple-testing control for slices through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 39
header("Demo 18 - Multiple-testing control for slices: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 19: Prioritizing failures
This cell studies Prioritizing failures through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 41
header("Demo 19 - Prioritizing failures: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")
Demo 20: Dashboard and report design
This cell studies Dashboard and report design through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 43
header("Demo 20 - Dashboard and report design: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 21: Model ablations
This cell studies Model ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 45
header("Demo 21 - Model ablations: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 22: Data ablations
This cell studies Data ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 47
header("Demo 22 - Data ablations: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 23: Prompt and decoding ablations
This cell studies Prompt and decoding ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 49
header("Demo 23 - Prompt and decoding ablations: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 24: Retrieval and tool ablations
This cell studies Retrieval and tool ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 51
header("Demo 24 - Retrieval and tool ablations: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")
Demo 25: Metric ablations
This cell studies Metric ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 53
header("Demo 25 - Metric ablations: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
for j in range(3):
ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")