Theory Notebook
Converted from
theory.ipynbfor web reading.
Monitoring Drift and Retraining
Drift monitoring and retraining convert production telemetry into controlled model maintenance instead of reactive firefighting.
This notebook is the executable companion to notes.md. It uses synthetic production signals so every cell runs without external services or data files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
def header(title):
print("\n" + "=" * 72)
print(title)
print("=" * 72)
def check_true(condition, name):
ok = bool(condition)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
assert ok, name
def check_close(value, target, tol=1e-8, name="value"):
ok = abs(float(value) - float(target)) <= tol
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
assert ok, name
def softmax(z):
z = np.asarray(z, dtype=float)
z = z - np.max(z)
e = np.exp(z)
return e / e.sum()
def psi(ref, cur, eps=1e-8):
ref = np.asarray(ref, dtype=float) + eps
cur = np.asarray(cur, dtype=float) + eps
ref = ref / ref.sum()
cur = cur / cur.sum()
return float(np.sum((cur - ref) * np.log(cur / ref)))
def js_divergence(p, q, eps=1e-8):
p = np.asarray(p, dtype=float) + eps
q = np.asarray(q, dtype=float) + eps
p = p / p.sum()
q = q / q.sum()
m = 0.5 * (p + q)
return float(0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m)))
def percentile(values, q):
return float(np.percentile(np.asarray(values, dtype=float), q))
print("Helper functions ready.")
Demo 1: models decay because the world changes
This demo makes the production idea concrete with a small numerical object.
Code cell 5
header("Demo 1 - models decay because the world changes: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")
Demo 2: monitoring versus evaluation boundary
This demo makes the production idea concrete with a small numerical object.
Code cell 7
header("Demo 2 - monitoring versus evaluation boundary: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")
Demo 3: data model and business signals
This demo makes the production idea concrete with a small numerical object.
Code cell 9
header("Demo 3 - data model and business signals: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")
Demo 4: alert fatigue
This demo makes the production idea concrete with a small numerical object.
Code cell 11
header("Demo 4 - alert fatigue: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")
Demo 5: retraining as a control loop
This demo makes the production idea concrete with a small numerical object.
Code cell 13
header("Demo 5 - retraining as a control loop: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")
Demo 6: reference distribution
This demo makes the production idea concrete with a small numerical object.
Code cell 15
header("Demo 6 - reference distribution $p_{\\mathrm{ref}}$: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")
Demo 7: current distribution
This demo makes the production idea concrete with a small numerical object.
Code cell 17
header("Demo 7 - current distribution $p_t$: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")
Demo 8: drift statistic
This demo makes the production idea concrete with a small numerical object.
Code cell 19
header("Demo 8 - drift statistic: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")
Demo 9: alert threshold
This demo makes the production idea concrete with a small numerical object.
Code cell 21
header("Demo 9 - alert threshold $\\tau$: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")
Demo 10: retraining policy
This demo makes the production idea concrete with a small numerical object.
Code cell 23
header("Demo 10 - retraining policy: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")
Demo 11: input drift
This demo makes the production idea concrete with a small numerical object.
Code cell 25
header("Demo 11 - input drift: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")
Demo 12: prediction drift
This demo makes the production idea concrete with a small numerical object.
Code cell 27
header("Demo 12 - prediction drift: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")
Demo 13: label and performance drift
This demo makes the production idea concrete with a small numerical object.
Code cell 29
header("Demo 13 - label and performance drift: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")
Demo 14: latency and cost
This demo makes the production idea concrete with a small numerical object.
Code cell 31
header("Demo 14 - latency and cost: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")
Demo 15: data quality
This demo makes the production idea concrete with a small numerical object.
Code cell 33
header("Demo 15 - data quality: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")
Demo 16: population stability index
This demo makes the production idea concrete with a small numerical object.
Code cell 35
header("Demo 16 - population stability index: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")
Demo 17: KL and JS divergence
This demo makes the production idea concrete with a small numerical object.
Code cell 37
header("Demo 17 - KL and JS divergence: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")
Demo 18: Wasserstein distance
This demo makes the production idea concrete with a small numerical object.
Code cell 39
header("Demo 18 - Wasserstein distance: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")
Demo 19: two-sample tests
This demo makes the production idea concrete with a small numerical object.
Code cell 41
header("Demo 19 - two-sample tests: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")
Demo 20: embedding drift
This demo makes the production idea concrete with a small numerical object.
Code cell 43
header("Demo 20 - embedding drift: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")
Demo 21: threshold tuning
This demo makes the production idea concrete with a small numerical object.
Code cell 45
header("Demo 21 - threshold tuning: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")
Demo 22: severity levels
This demo makes the production idea concrete with a small numerical object.
Code cell 47
header("Demo 22 - severity levels: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")
Demo 23: slice analysis
This demo makes the production idea concrete with a small numerical object.
Code cell 49
header("Demo 23 - slice analysis: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")
Demo 24: false positives
This demo makes the production idea concrete with a small numerical object.
Code cell 51
header("Demo 24 - false positives: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")
Demo 25: root-cause workflow
This demo makes the production idea concrete with a small numerical object.
Code cell 53
header("Demo 25 - root-cause workflow: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")