Theory Lab
Runnable lab version for web reading.
Instruction Tuning and SFT
Supervised fine-tuning aligns a pretrained next-token model with demonstrated instruction-following behavior by optimizing response tokens under a curated chat protocol.
This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objective, threshold, and feedback mechanics can be studied without external files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's whitegrid theme when the package is installed; otherwise
# fall back to matplotlib's built-in port of the same style.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults applied to every demo plot in this notebook.
PLOT_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(PLOT_DEFAULTS)

np.random.seed(42)  # reproducible synthetic draws in the demos below
print("Plot setup complete.")
Code cell 3
# Named hex colors shared by every demo plot.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def header(title):
    """Print *title* framed by two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
def sigmoid(x):
    """Numerically stable logistic sigmoid 1 / (1 + exp(-x)).

    The naive form ``1 / (1 + np.exp(-x))`` overflows (with a RuntimeWarning)
    for large-magnitude negative inputs. Computing through ``exp(-|x|)``
    keeps the exponential argument non-positive, so it can never overflow,
    while returning identical values in the normal range.
    """
    z = np.exp(-np.abs(x))
    # x >= 0:  1 / (1 + exp(-x));   x < 0:  exp(x) / (1 + exp(x))
    return np.where(np.asarray(x) >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
def softmax(z, axis=-1):
    """Return the softmax of *z* along *axis*.

    Subtracts the per-slice maximum before exponentiating so the
    exponentials cannot overflow; the result is unchanged.
    """
    arr = np.asarray(z, dtype=float)
    stable = arr - arr.max(axis=axis, keepdims=True)
    weights = np.exp(stable)
    return weights / weights.sum(axis=axis, keepdims=True)
def log_softmax(z, axis=-1):
    """Return log(softmax(z)) along *axis* via the max-shift trick.

    Working in log space (never forming the softmax itself) avoids
    underflow for very negative logits.
    """
    values = np.asarray(z, dtype=float)
    shifted = values - values.max(axis=axis, keepdims=True)
    log_norm = np.log(np.exp(shifted).sum(axis=axis, keepdims=True))
    return shifted - log_norm
def check_true(condition, message):
    """Print a PASS/FAIL line for *condition*; raise AssertionError when it fails.

    Raises explicitly instead of using a bare ``assert`` so the check still
    fires when Python runs with optimizations (``-O`` strips asserts), and
    the failure carries *message* for easier debugging.
    """
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {message}")
    if not ok:
        raise AssertionError(message)
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL comparing |actual - expected| to the absolute *tol*.

    Raises AssertionError explicitly (rather than via a bare ``assert``,
    which ``python -O`` would strip) so the check always fires, and the
    failure carries *message*.
    """
    a, e = float(actual), float(expected)
    ok = abs(a - e) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={a:.6f}, expected={e:.6f}")
    if not ok:
        raise AssertionError(message)
print("Alignment helper functions loaded.")
Demo 1: Why pretrained next-token models need instruction-following alignment
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 5
header("Demo 1 - Why pretrained next-token models need instruction-following alignment: response-only SFT loss")
# Toy sequence: 4 token positions over a 3-token vocabulary.
demo_logits = np.array(
    [[2.0, 0.2, -0.5],
     [0.1, 1.8, -0.2],
     [0.4, 0.0, 1.5],
     [1.1, 0.5, -0.4]]
)
demo_targets = np.array([0, 1, 2, 0])
# Position 0 is the prompt (weight 0); the remaining positions are response tokens.
response_mask = np.array([0, 1, 1, 1], dtype=float)
# Negative log-likelihood of each gold token under the model.
nll = -log_softmax(demo_logits, axis=1)[np.arange(demo_targets.size), demo_targets]
# Mean NLL over response positions only — the SFT objective.
loss = (nll * response_mask).sum() / response_mask.sum()
print("token NLL:", np.round(nll, 4))
print(f"masked response loss={loss:.4f}")
check_true(response_mask[0] == 0, "prompt token is excluded from SFT loss")
Demo 2: Instruction following as behavior conditioning
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 7
header("Demo 2 - Instruction following as behavior conditioning: loss-mask visualization")
# One row per packed example; 1 marks response tokens that receive loss.
mask = np.array(
    [[0, 0, 1, 1, 1, 0],
     [0, 1, 1, 0, 0, 0],
     [0, 0, 0, 1, 1, 1]]
)
fig, ax = plt.subplots(figsize=(8, 4))
heat = ax.imshow(mask, cmap="viridis", aspect="auto")
fig.colorbar(heat, ax=ax, label="loss weight")
ax.set(title="Response-token loss mask", xlabel="Token position", ylabel="Example")
fig.tight_layout()
plt.show()
print("active response tokens:", int(mask.sum()))
check_true(mask.sum() > 0, "mask contains trainable response tokens")
Demo 3: Demonstrations versus preferences
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 9
header("Demo 3 - Demonstrations versus preferences: SFT validation curve")
steps = np.arange(1, 41)
# Training loss: plain exponential decay toward an asymptote of 0.25.
train = 1.8 * np.exp(-steps / 18) + 0.25
# Validation loss: slower decay plus a linear overfitting ramp after step 26.
overfit_ramp = 0.006 * np.maximum(steps - 26, 0)
val = 1.4 * np.exp(-steps / 22) + 0.38 + overfit_ramp
fig, ax = plt.subplots()
ax.plot(steps, train, color=COLORS["primary"], label="train")
ax.plot(steps, val, color=COLORS["secondary"], label="validation")
ax.set(title="SFT loss curves", xlabel="Step", ylabel="Cross-entropy loss")
ax.legend()
fig.tight_layout()
plt.show()
# Early-stopping point: the interior minimum of the validation curve.
best_step = int(steps[val.argmin()])
print(f"best validation step={best_step}")
check_true(best_step > 1, "validation curve has an interior best step")
Demo 4: Chat templates as part of the objective
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 11
header("Demo 4 - Chat templates as part of the objective: guardrail threshold tradeoff")
# Synthetic guardrail scores: harmful prompts skew high, benign ones skew low.
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
# Fraction of each population blocked at every threshold.
tpr = [(tau_bad := tau, (scores_bad >= tau).mean())[1] for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set(title="Guardrail threshold tradeoff", xlabel="Threshold", ylabel="Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 5: Historical arc from FLAN to InstructGPT
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 13
header("Demo 5 - Historical arc from FLAN to InstructGPT: feedback loop improvement")
rounds = np.arange(6)
# Safety saturates toward +0.05 above its 0.72 starting point.
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
# Helpfulness: slow linear decline softened by a sub-linear sqrt term.
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set(title="Feedback loop metrics", xlabel="Refresh round", ylabel="Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 6: Prompt and response
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 15
header("Demo 6 - Prompt $x$ and response $y$: response-only SFT loss")
# Toy sequence: 4 token positions over a 3-token vocabulary.
demo_logits = np.array(
    [[2.0, 0.2, -0.5],
     [0.1, 1.8, -0.2],
     [0.4, 0.0, 1.5],
     [1.1, 0.5, -0.4]]
)
demo_targets = np.array([0, 1, 2, 0])
# Position 0 is the prompt (weight 0); the remaining positions are response tokens.
response_mask = np.array([0, 1, 1, 1], dtype=float)
# Negative log-likelihood of each gold token under the model.
nll = -log_softmax(demo_logits, axis=1)[np.arange(demo_targets.size), demo_targets]
# Mean NLL over response positions only — the SFT objective.
loss = (nll * response_mask).sum() / response_mask.sum()
print("token NLL:", np.round(nll, 4))
print(f"masked response loss={loss:.4f}")
check_true(response_mask[0] == 0, "prompt token is excluded from SFT loss")
Demo 7: Demonstration dataset
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 17
# Raw string: "\m" in "\mathcal" / "\mathrm" is an invalid escape sequence,
# which emits a SyntaxWarning on Python 3.12+ inside a plain string literal.
header(r"Demo 7 - Demonstration dataset $\mathcal{D}_{\mathrm{SFT}}$: loss-mask visualization")
# One row per packed example; 1 marks response tokens that receive loss.
mask = np.array([[0, 0, 1, 1, 1, 0], [0, 1, 1, 0, 0, 0], [0, 0, 0, 1, 1, 1]])
fig, ax = plt.subplots(figsize=(8, 4))
im = ax.imshow(mask, cmap="viridis", aspect="auto")
fig.colorbar(im, ax=ax, label="loss weight")
ax.set_title("Response-token loss mask")
ax.set_xlabel("Token position")
ax.set_ylabel("Example")
fig.tight_layout()
plt.show()
print("active response tokens:", int(mask.sum()))
check_true(mask.sum() > 0, "mask contains trainable response tokens")
Demo 8: Policy
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 19
# Raw string bug fix: in a plain literal "\t" inside "\theta" is parsed as a
# TAB character, so the printed header was corrupted to "$\pi<TAB>heta...".
# "\m" in "\mid" is also an invalid escape (SyntaxWarning on Python 3.12+).
header(r"Demo 8 - Policy $\pi_\theta(y \mid x)$: SFT validation curve")
steps = np.arange(1, 41)
# Training loss: exponential decay toward an asymptote of 0.25.
train = 1.8 * np.exp(-steps / 18) + 0.25
# Validation loss: slower decay plus a linear overfitting ramp after step 26.
val = 1.4 * np.exp(-steps / 22) + 0.38 + 0.006 * np.maximum(steps - 26, 0)
fig, ax = plt.subplots()
ax.plot(steps, train, color=COLORS["primary"], label="train")
ax.plot(steps, val, color=COLORS["secondary"], label="validation")
ax.set_title("SFT loss curves")
ax.set_xlabel("Step")
ax.set_ylabel("Cross-entropy loss")
ax.legend()
fig.tight_layout()
plt.show()
# Early-stopping point: the interior minimum of the validation curve.
best_step = int(steps[val.argmin()])
print(f"best validation step={best_step}")
check_true(best_step > 1, "validation curve has an interior best step")
Demo 9: Response-token mask
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 21
header("Demo 9 - Response-token mask: guardrail threshold tradeoff")
# Synthetic guardrail scores: harmful prompts skew high, benign ones skew low.
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
# Fraction of each population blocked at every threshold.
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set(title="Guardrail threshold tradeoff", xlabel="Threshold", ylabel="Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 10: Instruction distribution and validation split
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 23
header("Demo 10 - Instruction distribution and validation split: feedback loop improvement")
rounds = np.arange(6)
# Safety saturates toward +0.05 above its 0.72 starting point.
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
# Helpfulness: slow linear decline softened by a sub-linear sqrt term.
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set(title="Feedback loop metrics", xlabel="Refresh round", ylabel="Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 11: Task diversity
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 25
header("Demo 11 - Task diversity: response-only SFT loss")
# Toy sequence: 4 token positions over a 3-token vocabulary.
demo_logits = np.array(
    [[2.0, 0.2, -0.5],
     [0.1, 1.8, -0.2],
     [0.4, 0.0, 1.5],
     [1.1, 0.5, -0.4]]
)
demo_targets = np.array([0, 1, 2, 0])
# Position 0 is the prompt (weight 0); the remaining positions are response tokens.
response_mask = np.array([0, 1, 1, 1], dtype=float)
# Negative log-likelihood of each gold token under the model.
nll = -log_softmax(demo_logits, axis=1)[np.arange(demo_targets.size), demo_targets]
# Mean NLL over response positions only — the SFT objective.
loss = (nll * response_mask).sum() / response_mask.sum()
print("token NLL:", np.round(nll, 4))
print(f"masked response loss={loss:.4f}")
check_true(response_mask[0] == 0, "prompt token is excluded from SFT loss")
Demo 12: Chat templates
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 27
header("Demo 12 - Chat templates: loss-mask visualization")
# One row per packed example; 1 marks response tokens that receive loss.
mask = np.array(
    [[0, 0, 1, 1, 1, 0],
     [0, 1, 1, 0, 0, 0],
     [0, 0, 0, 1, 1, 1]]
)
fig, ax = plt.subplots(figsize=(8, 4))
heat = ax.imshow(mask, cmap="viridis", aspect="auto")
fig.colorbar(heat, ax=ax, label="loss weight")
ax.set(title="Response-token loss mask", xlabel="Token position", ylabel="Example")
fig.tight_layout()
plt.show()
print("active response tokens:", int(mask.sum()))
check_true(mask.sum() > 0, "mask contains trainable response tokens")
Demo 13: Role tokens
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 29
header("Demo 13 - Role tokens: SFT validation curve")
steps = np.arange(1, 41)
# Training loss: plain exponential decay toward an asymptote of 0.25.
train = 1.8 * np.exp(-steps / 18) + 0.25
# Validation loss: slower decay plus a linear overfitting ramp after step 26.
overfit_ramp = 0.006 * np.maximum(steps - 26, 0)
val = 1.4 * np.exp(-steps / 22) + 0.38 + overfit_ramp
fig, ax = plt.subplots()
ax.plot(steps, train, color=COLORS["primary"], label="train")
ax.plot(steps, val, color=COLORS["secondary"], label="validation")
ax.set(title="SFT loss curves", xlabel="Step", ylabel="Cross-entropy loss")
ax.legend()
fig.tight_layout()
plt.show()
# Early-stopping point: the interior minimum of the validation curve.
best_step = int(steps[val.argmin()])
print(f"best validation step={best_step}")
check_true(best_step > 1, "validation curve has an interior best step")
Demo 14: Refusal examples
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 31
header("Demo 14 - Refusal examples: guardrail threshold tradeoff")
# Synthetic guardrail scores: harmful prompts skew high, benign ones skew low.
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
# Fraction of each population blocked at every threshold.
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set(title="Guardrail threshold tradeoff", xlabel="Threshold", ylabel="Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 15: Multi-turn demonstrations
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 33
header("Demo 15 - Multi-turn demonstrations: feedback loop improvement")
rounds = np.arange(6)
# Safety saturates toward +0.05 above its 0.72 starting point.
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
# Helpfulness: slow linear decline softened by a sub-linear sqrt term.
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set(title="Feedback loop metrics", xlabel="Refresh round", ylabel="Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 16: Response-only cross-entropy
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 35
header("Demo 16 - Response-only cross-entropy: response-only SFT loss")
# Toy sequence: 4 token positions over a 3-token vocabulary.
demo_logits = np.array(
    [[2.0, 0.2, -0.5],
     [0.1, 1.8, -0.2],
     [0.4, 0.0, 1.5],
     [1.1, 0.5, -0.4]]
)
demo_targets = np.array([0, 1, 2, 0])
# Position 0 is the prompt (weight 0); the remaining positions are response tokens.
response_mask = np.array([0, 1, 1, 1], dtype=float)
# Negative log-likelihood of each gold token under the model.
nll = -log_softmax(demo_logits, axis=1)[np.arange(demo_targets.size), demo_targets]
# Mean NLL over response positions only — the SFT objective.
loss = (nll * response_mask).sum() / response_mask.sum()
print("token NLL:", np.round(nll, 4))
print(f"masked response loss={loss:.4f}")
check_true(response_mask[0] == 0, "prompt token is excluded from SFT loss")
Demo 17: Packed examples
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 37
header("Demo 17 - Packed examples: loss-mask visualization")
# One row per packed example; 1 marks response tokens that receive loss.
mask = np.array(
    [[0, 0, 1, 1, 1, 0],
     [0, 1, 1, 0, 0, 0],
     [0, 0, 0, 1, 1, 1]]
)
fig, ax = plt.subplots(figsize=(8, 4))
heat = ax.imshow(mask, cmap="viridis", aspect="auto")
fig.colorbar(heat, ax=ax, label="loss weight")
ax.set(title="Response-token loss mask", xlabel="Token position", ylabel="Example")
fig.tight_layout()
plt.show()
print("active response tokens:", int(mask.sum()))
check_true(mask.sum() > 0, "mask contains trainable response tokens")
Demo 18: Loss masks
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 39
header("Demo 18 - Loss masks: SFT validation curve")
steps = np.arange(1, 41)
# Training loss: plain exponential decay toward an asymptote of 0.25.
train = 1.8 * np.exp(-steps / 18) + 0.25
# Validation loss: slower decay plus a linear overfitting ramp after step 26.
overfit_ramp = 0.006 * np.maximum(steps - 26, 0)
val = 1.4 * np.exp(-steps / 22) + 0.38 + overfit_ramp
fig, ax = plt.subplots()
ax.plot(steps, train, color=COLORS["primary"], label="train")
ax.plot(steps, val, color=COLORS["secondary"], label="validation")
ax.set(title="SFT loss curves", xlabel="Step", ylabel="Cross-entropy loss")
ax.legend()
fig.tight_layout()
plt.show()
# Early-stopping point: the interior minimum of the validation curve.
best_step = int(steps[val.argmin()])
print(f"best validation step={best_step}")
check_true(best_step > 1, "validation curve has an interior best step")
Demo 19: Class imbalance
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 41
header("Demo 19 - Class imbalance: guardrail threshold tradeoff")
# Synthetic guardrail scores: harmful prompts skew high, benign ones skew low.
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
# Fraction of each population blocked at every threshold.
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set(title="Guardrail threshold tradeoff", xlabel="Threshold", ylabel="Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 20: Validation curves
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 43
header("Demo 20 - Validation curves: feedback loop improvement")
rounds = np.arange(6)
# Safety saturates toward +0.05 above its 0.72 starting point.
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
# Helpfulness: slow linear decline softened by a sub-linear sqrt term.
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set(title="Feedback loop metrics", xlabel="Refresh round", ylabel="Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 21: Helpfulness
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 45
header("Demo 21 - Helpfulness: response-only SFT loss")
# Toy sequence: 4 token positions over a 3-token vocabulary.
demo_logits = np.array(
    [[2.0, 0.2, -0.5],
     [0.1, 1.8, -0.2],
     [0.4, 0.0, 1.5],
     [1.1, 0.5, -0.4]]
)
demo_targets = np.array([0, 1, 2, 0])
# Position 0 is the prompt (weight 0); the remaining positions are response tokens.
response_mask = np.array([0, 1, 1, 1], dtype=float)
# Negative log-likelihood of each gold token under the model.
nll = -log_softmax(demo_logits, axis=1)[np.arange(demo_targets.size), demo_targets]
# Mean NLL over response positions only — the SFT objective.
loss = (nll * response_mask).sum() / response_mask.sum()
print("token NLL:", np.round(nll, 4))
print(f"masked response loss={loss:.4f}")
check_true(response_mask[0] == 0, "prompt token is excluded from SFT loss")
Demo 22: Honesty
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 47
header("Demo 22 - Honesty: loss-mask visualization")
# One row per packed example; 1 marks response tokens that receive loss.
mask = np.array(
    [[0, 0, 1, 1, 1, 0],
     [0, 1, 1, 0, 0, 0],
     [0, 0, 0, 1, 1, 1]]
)
fig, ax = plt.subplots(figsize=(8, 4))
heat = ax.imshow(mask, cmap="viridis", aspect="auto")
fig.colorbar(heat, ax=ax, label="loss weight")
ax.set(title="Response-token loss mask", xlabel="Token position", ylabel="Example")
fig.tight_layout()
plt.show()
print("active response tokens:", int(mask.sum()))
check_true(mask.sum() > 0, "mask contains trainable response tokens")
Demo 23: Harmlessness
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 49
header("Demo 23 - Harmlessness: SFT validation curve")
steps = np.arange(1, 41)
# Training loss: plain exponential decay toward an asymptote of 0.25.
train = 1.8 * np.exp(-steps / 18) + 0.25
# Validation loss: slower decay plus a linear overfitting ramp after step 26.
overfit_ramp = 0.006 * np.maximum(steps - 26, 0)
val = 1.4 * np.exp(-steps / 22) + 0.38 + overfit_ramp
fig, ax = plt.subplots()
ax.plot(steps, train, color=COLORS["primary"], label="train")
ax.plot(steps, val, color=COLORS["secondary"], label="validation")
ax.set(title="SFT loss curves", xlabel="Step", ylabel="Cross-entropy loss")
ax.legend()
fig.tight_layout()
plt.show()
# Early-stopping point: the interior minimum of the validation curve.
best_step = int(steps[val.argmin()])
print(f"best validation step={best_step}")
check_true(best_step > 1, "validation curve has an interior best step")
Demo 24: Style control
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 51
header("Demo 24 - Style control: guardrail threshold tradeoff")
# Synthetic guardrail scores: harmful prompts skew high, benign ones skew low.
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
# Fraction of each population blocked at every threshold.
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set(title="Guardrail threshold tradeoff", xlabel="Threshold", ylabel="Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 25: Sycophancy risk
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 53
header("Demo 25 - Sycophancy risk: feedback loop improvement")
rounds = np.arange(6)
# Safety saturates toward +0.05 above its 0.72 starting point.
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
# Helpfulness: slow linear decline softened by a sub-linear sqrt term.
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set(title="Feedback loop metrics", xlabel="Refresh round", ylabel="Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")