Theory Notebook
Converted from theory.ipynb for web reading.
Policy and Guardrails
Policies define desired and disallowed behavior, while guardrails translate those rules into classifiers, validators, gates, and intervention actions.
This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objective, threshold, and feedback mechanics can be studied without external files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Prefer seaborn theming when it is installed; otherwise fall back to the
# matplotlib-bundled port of the same whitegrid style.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults applied to every demo in this notebook.
_PLOT_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_PLOT_DEFAULTS)

# Fixed seed so every synthetic dataset below is reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
# Colorblind-friendly hex palette shared by all demo figures.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def header(title):
    """Print *title* framed above and below by 80-character rules."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive form overflows (with a RuntimeWarning) in ``np.exp(-x)``
    for large negative ``x``; computing it as ``exp(-log(1 + exp(-x)))``
    via ``np.logaddexp`` stays finite across the whole float range.
    Accepts scalars or arrays, like the original.
    """
    return np.exp(-np.logaddexp(0.0, -x))
def softmax(z, axis=-1):
    """Softmax along *axis*, with max-subtraction for numerical stability."""
    arr = np.asarray(z, dtype=float)
    # Shifting by the per-axis max leaves the result unchanged but
    # prevents overflow in exp for large inputs.
    stabilized = arr - arr.max(axis=axis, keepdims=True)
    weights = np.exp(stabilized)
    return weights / weights.sum(axis=axis, keepdims=True)
def log_softmax(z, axis=-1):
    """Log of the softmax along *axis*, computed in a stable shifted form."""
    arr = np.asarray(z, dtype=float)
    # Same max-shift trick as softmax; the shift cancels in the result.
    centered = arr - arr.max(axis=axis, keepdims=True)
    log_norm = np.log(np.exp(centered).sum(axis=axis, keepdims=True))
    return centered - log_norm
def check_true(condition, message):
    """Print a PASS/FAIL line for *condition* and raise on failure.

    Raises AssertionError explicitly instead of using a bare ``assert``,
    so the check still fires under ``python -O`` (which strips asserts),
    and the failure carries *message* for easier debugging.
    """
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {message}")
    if not ok:
        raise AssertionError(message)
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print a PASS/FAIL comparison and raise if |actual - expected| > tol.

    Mirrors check-style helpers elsewhere in this notebook: raises
    AssertionError explicitly rather than via a bare ``assert`` so the
    check survives ``python -O``, and the error carries the message.
    """
    a, e = float(actual), float(expected)
    ok = abs(a - e) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={a:.6f}, expected={e:.6f}")
    if not ok:
        raise AssertionError(f"{message}: actual={a!r}, expected={e!r}, tol={tol!r}")

print("Alignment helper functions loaded.")
Demo 1: Alignment needs explicit policy
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 5
header("Demo 1 - Alignment needs explicit policy: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 2: Guardrails as runtime control
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 7
header("Demo 2 - Guardrails as runtime control: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 3: Policy hierarchy and conflict resolution
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 9
header("Demo 3 - Policy hierarchy and conflict resolution: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 4: Safety is not only refusal
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 11
header("Demo 4 - Safety is not only refusal: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 5: Why system prompts alone are insufficient
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 13
header("Demo 5 - Why system prompts alone are insufficient: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 6: Policy rule
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 15
header("Demo 6 - Policy rule: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 7: Allowed and disallowed sets
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 17
header("Demo 7 - Allowed and disallowed sets: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 8: Classifier
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 19
header("Demo 8 - Classifier $c(x,y)$: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 9: Guardrail action
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 21
header("Demo 9 - Guardrail action $a$: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 10: Risk threshold
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 23
header("Demo 10 - Risk threshold $\tau$: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 11: Harm categories
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 25
header("Demo 11 - Harm categories: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 12: Capability boundaries
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 27
header("Demo 12 - Capability boundaries: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 13: User intent
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 29
header("Demo 13 - User intent: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 14: Context sensitivity
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 31
header("Demo 14 - Context sensitivity: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 15: Jurisdiction notes
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 33
header("Demo 15 - Jurisdiction notes: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 16: Critique-revise loops
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 35
header("Demo 16 - Critique-revise loops: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 17: AI feedback
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 37
header("Demo 17 - AI feedback: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 18: Rule hierarchy
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 39
header("Demo 18 - Rule hierarchy: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 19: Policy distillation
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 41
header("Demo 19 - Policy distillation: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 20: Constitutional evaluation
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 43
header("Demo 20 - Constitutional evaluation: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 21: Input filters
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 45
header("Demo 21 - Input filters: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 22: Output filters
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 47
header("Demo 22 - Output filters: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 23: Tool gates
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 49
header("Demo 23 - Tool gates: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 24: Retrieval gates
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 51
header("Demo 24 - Retrieval gates: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 25: Structured validators
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 53
header("Demo 25 - Structured validators: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")