Theory Notebook
2 min read · 18 headings
Theory Notebook
Converted from `theory.ipynb` for web reading.
Loss Functions - Theory Notebook
This notebook is the executable companion to notes.md. It turns loss formulas
into curves, gradients, masking rules, and stability checks.
Code cell 2
# Plotting environment: prefer seaborn's colorblind-safe theme, falling back
# to matplotlib's port of the same style when seaborn is unavailable.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults applied to every plot in the notebook.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

np.random.seed(42)  # reproducible random draws throughout the notebook
print("Plot setup complete.")
Code cell 3
# NOTE: this cell previously repeated the entire plotting-setup cell above
# verbatim (imports, seaborn/matplotlib styling, rcParams, np.random.seed).
# That duplicate work was redundant — identical settings, and no random
# draws happen between the two cells, so re-seeding changed nothing — and
# has been removed. Only the new content, the shared color palette used by
# every plot below, is kept.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
Code cell 4
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    The original form evaluated np.exp(-z) directly, which overflows (with a
    RuntimeWarning) for large negative z even though the limit value is 0.
    Both branches below keep the exp() argument non-positive, so overflow
    cannot occur for any input.

    Accepts scalars or arrays; returns an array of the broadcast shape.
    """
    z = np.asarray(z, dtype=float)
    t = np.exp(-np.abs(z))  # always in (0, 1]; never overflows
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) — same function.
    return np.where(z >= 0, 1.0 / (1.0 + t), t / (1.0 + t))
def softmax(logits, axis=-1):
    """Stable softmax along `axis`: shift by the max before exponentiating."""
    arr = np.asarray(logits, dtype=float)
    arr = arr - arr.max(axis=axis, keepdims=True)  # max-shift avoids overflow
    weights = np.exp(arr)
    return weights / weights.sum(axis=axis, keepdims=True)
def logsumexp(logits, axis=-1, keepdims=False):
    """Stable log(sum(exp(logits))) along `axis` via the max-shift trick."""
    arr = np.asarray(logits, dtype=float)
    peak = arr.max(axis=axis, keepdims=True)
    result = peak + np.log(np.exp(arr - peak).sum(axis=axis, keepdims=True))
    return result if keepdims else np.squeeze(result, axis=axis)
def check_close(name, value, expected, tol=1e-8):
    """Print a PASS/FAIL line for a numeric closeness check and return the verdict."""
    passed = bool(np.allclose(value, expected, atol=tol, rtol=tol))
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return passed
def check_true(name, condition):
    """Print a PASS/FAIL line for a boolean condition; return it as a plain bool."""
    passed = bool(condition)
    print(f"{'PASS' if passed else 'FAIL'} - {name}")
    return passed

print("Helper functions ready.")
1. Loss as a training signal
Code cell 6
# One deliberate outlier residual (7.0) to contrast quadratic vs linear penalties.
y_true = np.array([2.0, -1.0, 0.5, 3.0])
y_pred = np.array([1.5, -0.5, 1.0, 10.0])
residual = np.subtract(y_pred, y_true)
print("Residuals:", residual)
print("Large residual dominates MSE:", residual[-1] ** 2)
print("Large residual has linear MAE penalty:", abs(residual[-1]))
Code cell 7
# Loss curves on a shared residual grid; Huber is quadratic near zero and
# linear in the tails, interpolating between MSE and MAE.
r = np.linspace(-5, 5, 401)
delta = 1.0
mse = r ** 2
mae = np.abs(r)
huber = np.where(np.abs(r) <= delta, 0.5 * r ** 2, delta * (np.abs(r) - 0.5 * delta))

fig, ax = plt.subplots()
curves = [
    (mse, "MSE", COLORS["primary"]),
    (mae, "MAE", COLORS["secondary"]),
    (huber, "Huber delta=1", COLORS["tertiary"]),
]
for values, label, color in curves:
    ax.plot(r, values, label=label, color=color)
ax.set_title("Regression loss curves")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Plotted MSE, MAE, and Huber loss.")
2. Regression gradients
Code cell 9
# Gradients w.r.t. the prediction: MSE grows without bound, MAE is a
# constant-magnitude subgradient, Huber caps the magnitude at delta.
mse_grad = 2 * r
mae_grad = np.sign(r)
huber_grad = np.where(np.abs(r) <= delta, r, delta * np.sign(r))

fig, ax = plt.subplots()
for values, label, color in [
    (mse_grad, "MSE gradient", COLORS["primary"]),
    (mae_grad, "MAE subgradient", COLORS["secondary"]),
    (huber_grad, "Huber gradient", COLORS["tertiary"]),
]:
    ax.plot(r, values, label=label, color=color)
ax.set_title("Gradient scale differs by loss")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Gradient with respect to prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Gradient scale comparison complete.")
Code cell 10
def quantile_loss(y, q, tau):
    """Pinball loss: under-prediction weighted by tau, over-prediction by 1-tau."""
    diff = y - q
    # Equivalent to max(tau*diff, (tau-1)*diff): the larger branch is
    # tau*diff when diff >= 0 and (tau-1)*diff when diff < 0.
    return np.where(diff >= 0, tau * diff, (tau - 1) * diff)
# Moving tau away from 0.5 penalizes over- and under-prediction unequally.
fig, ax = plt.subplots()
palette = [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"]]
for tau, color in zip([0.1, 0.5, 0.9], palette):
    ax.plot(r, quantile_loss(r, 0.0, tau), label=f"tau={tau}", color=color)
ax.set_title("Quantile loss is asymmetric")
ax.set_xlabel("Target residual $y-q$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Quantile loss shows asymmetric penalties.")
Code cell 11
# log-cosh ~ r^2/2 near zero and ~ |r| - log 2 in the tails: a smooth Huber.
log_cosh = np.log(np.cosh(r))
fig, ax = plt.subplots()
ax.plot(r, log_cosh, color=COLORS["primary"], label="log-cosh")
ax.plot(r, huber, color=COLORS["secondary"], linestyle="--", label="Huber delta=1")
ax.set_title("Smooth robust losses")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("log-cosh behaves quadratically near zero and linearly in the tails.")
3. Binary classification from logits
Code cell 13
def bce_probability(p, y):
    """Binary cross-entropy from probabilities, clipped away from 0/1 for log safety."""
    eps = 1e-12
    p_safe = np.clip(p, eps, 1 - eps)
    pos_term = y * np.log(p_safe)
    neg_term = (1 - y) * np.log(1 - p_safe)
    return -(pos_term + neg_term)
def bce_logits(z, y):
    """Numerically stable binary cross-entropy straight from logits.

    Uses max(z, 0) - z*y + log(1 + exp(-|z|)), which never exponentiates a
    large positive number.
    """
    positive_part = np.maximum(z, 0)
    softplus_tail = np.log1p(np.exp(-np.abs(z)))
    return positive_part - z * y + softplus_tail
# The probability-space and logit-space BCE formulations must agree for
# both confident (|z| = 8) and uncertain (z near 0) logits.
z = np.array([-8.0, -1.0, 0.0, 1.0, 8.0])
y = np.array([0, 0, 1, 1, 1])
p = sigmoid(z)
loss_prob = bce_probability(p, y)
loss_logits = bce_logits(z, y)
print("Prob-space BCE:", np.round(loss_prob, 6))
print("Logit-space BCE:", np.round(loss_logits, 6))
check_close("probability and logit BCE match", loss_prob, loss_logits, tol=1e-10)
Code cell 14
# BCE as a function of the raw logit for each target value.
z_grid = np.linspace(-10, 10, 400)
loss_y0 = bce_logits(z_grid, 0)
loss_y1 = bce_logits(z_grid, 1)
fig, ax = plt.subplots()
ax.plot(z_grid, loss_y0, color=COLORS["primary"], label="target y=0")
ax.plot(z_grid, loss_y1, color=COLORS["secondary"], label="target y=1")
ax.set_title("Binary cross-entropy from logits")
ax.set_xlabel("Logit $z$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("BCE loss penalizes confident wrong logits sharply.")
Code cell 15
# d(BCE)/dz = sigmoid(z) - y; plotted here for the target y = 1.
grad_bce = sigmoid(z_grid) - 1
fig, ax = plt.subplots()
ax.plot(z_grid, grad_bce, color=COLORS["primary"], label="gradient for y=1")
ax.axhline(0, color=COLORS["neutral"], linestyle="--", label="zero")
ax.set_title("BCE logit gradient is sigmoid(z)-y")
ax.set_xlabel("Logit $z$")
ax.set_ylabel("Gradient")
ax.legend()
fig.tight_layout()
plt.show()
print("BCE gradient demonstration complete.")
4. Multiclass CE and log-sum-exp
Code cell 17
# Extreme logits (around 1000) stay finite thanks to log-sum-exp shifting.
logits = np.array([[3.0, 1.0, -2.0], [1000.0, 999.0, 998.0], [-4.0, 2.0, 1.0]])
targets = np.array([0, 2, 1])

def cross_entropy_logits(logits, targets):
    """Per-row cross-entropy: logsumexp(row) minus the target class's logit."""
    rows = np.arange(len(targets))
    return logsumexp(logits, axis=1) - logits[rows, targets]

ce = cross_entropy_logits(logits, targets)
print("Stable CE:", np.round(ce, 6))
check_true("finite CE for huge logits", np.isfinite(ce).all())
Code cell 18
# The CE gradient w.r.t. logits is softmax(logits) - one_hot(target);
# each row sums to zero because the probabilities sum to one.
probs = softmax(logits, axis=1)
one_hot = np.identity(3)[targets]
grad = probs - one_hot
print("Softmax probabilities:\n", np.round(probs, 4))
print("CE gradient probs - one_hot:\n", np.round(grad, 4))
print("Row sums of gradient:", np.round(np.sum(grad, axis=1), 12))
check_close("each CE logit gradient row sums to zero", np.sum(grad, axis=1), np.zeros(3))
Code cell 19
# CE over two free logits with the third pinned at zero; target is class 0.
axis_vals = np.linspace(-4, 4, 160)
Z1, Z2 = np.meshgrid(axis_vals, axis_vals)
logit_grid = np.stack([Z1, Z2, np.zeros_like(Z1)], axis=-1)
target_grid = np.zeros(Z1.size, dtype=int)
CE_grid = cross_entropy_logits(logit_grid.reshape(-1, 3), target_grid).reshape(Z1.shape)
fig, ax = plt.subplots(figsize=(8, 7))
im = ax.contourf(Z1, Z2, CE_grid, levels=40, cmap="plasma")
fig.colorbar(im, ax=ax, label="CE for class 0")
ax.set_title("Cross-entropy landscape over two logits")
ax.set_xlabel("Logit $z_1$")
ax.set_ylabel("Logit $z_2$")
fig.tight_layout()
plt.show()
print("CE landscape plotted.")
5. Hinge, focal, and label smoothing
Code cell 21
# Losses as a function of the signed margin y*s. Hinge is exactly zero past
# margin 1; logistic decays smoothly but never reaches zero.
margin = np.linspace(-3, 3, 400)
hinge = np.maximum(0, 1 - margin)
logistic = np.log1p(np.exp(-margin))
fig, ax = plt.subplots()
ax.plot(margin, hinge, color=COLORS["primary"], label="hinge")
ax.plot(margin, logistic, color=COLORS["secondary"], label="logistic")
ax.axvline(1, linestyle="--", color=COLORS["neutral"], label="margin=1")
ax.set_title("Margin losses")
ax.set_xlabel("Signed margin $ys$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Hinge has zero loss beyond the margin.")
Code cell 22
# Focal loss: (1 - p_t)^gamma scales down the CE of well-classified examples.
# Fixed for consistency with every other cell: use the explicit
# fig, ax = plt.subplots() pattern instead of plt.plot followed by
# plt.gcf()/plt.gca() to recover the implicit figure and axes.
pt = np.linspace(0.001, 0.999, 400)
fig, ax = plt.subplots()
palette = [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]]
for gamma, color in zip([0, 1, 2, 5], palette):
    focal = -((1 - pt) ** gamma) * np.log(pt)  # gamma=0 recovers plain CE
    ax.plot(pt, focal, label=f"gamma={gamma}", color=color)
ax.set_title("Focal loss downweights easy examples")
ax.set_xlabel("Correct-class probability $p_t$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Focal loss comparison complete.")
Code cell 23
# Label smoothing blends a one-hot target with the uniform distribution;
# the result still sums to one.
C = 5
eps = 0.1
hard = np.eye(C)[2]
smooth = hard * (1 - eps) + np.full(C, eps / C)
print("Hard target:", hard)
print("Smoothed target:", smooth)
print("Target sums:", hard.sum(), smooth.sum())
check_close("smoothed target sums to 1", smooth.sum(), 1.0)
6. Masked and weighted losses
Code cell 25
# Per-token losses with padding positions zeroed out by the mask. The
# denominator must count real tokens, not batch rows.
token_losses = np.array([[0.2, 0.7, 0.0, 0.0],
                         [0.4, 0.6, 0.9, 0.0]])
mask = np.array([[1, 1, 0, 0],
                 [1, 1, 1, 0]])
valid_total = (token_losses * mask).sum()
masked_mean = valid_total / mask.sum()                 # divide by #real tokens
batch_mean_wrong = valid_total / token_losses.shape[0]  # wrong: divide by batch size
print("Masked token mean:", masked_mean)
print("Wrong batch denominator:", batch_mean_wrong)
check_true("denominators change the objective", abs(masked_mean - batch_mean_wrong) > 1e-6)
Code cell 26
# Example weighting lets a single important sample dominate the average.
losses = np.array([0.1, 0.4, 2.0, 0.3])
weights = np.array([1.0, 1.0, 5.0, 1.0])
weighted_mean = np.sum(weights * losses) / np.sum(weights)
plain_mean = losses.mean()
print("Plain mean:", plain_mean)
print("Weighted mean:", weighted_mean)
check_true("rare or important examples can dominate", weighted_mean > plain_mean)
7. Contrastive losses
Code cell 28
def cosine_similarity_matrix(A, B):
    """Pairwise cosine similarities between rows of A and rows of B.

    Rows are L2-normalized before the dot product. The original divided by
    the raw norms, so an all-zero row produced 0/0 = NaN; the norms are now
    clamped at a tiny floor, mapping zero rows to zero similarity while
    leaving every nonzero row's result bitwise unchanged.
    """
    def _unit_rows(M):
        M = np.asarray(M, dtype=float)
        norms = np.linalg.norm(M, axis=1, keepdims=True)
        # Floor well below any realistic norm: guards 0/0 without altering
        # the result for nonzero rows.
        return M / np.maximum(norms, 1e-12)

    return _unit_rows(A) @ _unit_rows(B).T
# Keys are noisy copies of the queries, so each matched pair (the diagonal)
# should be the most similar entry in its row.
queries = np.random.normal(size=(6, 4))
keys = queries + 0.25 * np.random.normal(size=(6, 4))
sim = cosine_similarity_matrix(queries, keys)
labels = np.arange(6)
info_nce = cross_entropy_logits(sim / 0.2, labels)  # temperature tau = 0.2
print("Similarity matrix:\n", np.round(sim, 3))
print("InfoNCE losses:", np.round(info_nce, 4))
print("Mean InfoNCE:", np.mean(info_nce))
Code cell 29
# Heatmap of the query-key similarities; positives lie on the diagonal.
fig, ax = plt.subplots(figsize=(7, 6))
heat = ax.imshow(sim, cmap="viridis", aspect="auto")
fig.colorbar(heat, ax=ax, label="Cosine similarity")
ax.set_title("Positive pairs sit on the diagonal")
ax.set_xlabel("Key index")
ax.set_ylabel("Query index")
fig.tight_layout()
plt.show()
print("Contrastive similarity heatmap plotted.")
Code cell 30
# Lower temperature sharpens the softmax rows, concentrating probability
# mass on the diagonal positives.
for tau in (1.0, 0.5, 0.2, 0.1):
    row_probs = softmax(sim / tau, axis=1)
    diagonal_mass = np.mean(np.diag(row_probs))
    print(f"tau={tau:0.1f}, mean positive probability={diagonal_mass:0.4f}")
8. Triplet and ranking losses
Code cell 32
# Squared-distance triplet loss; this negative is far enough away that the
# hinge is inactive and the loss is exactly zero.
anchor = np.array([0.0, 0.0])
positive = np.array([0.5, 0.2])
negative = np.array([1.5, 1.0])
margin = 0.5
d_pos = np.sum(np.square(anchor - positive))
d_neg = np.sum(np.square(anchor - negative))
triplet = max(0.0, d_pos - d_neg + margin)
print("d(anchor, positive):", d_pos)
print("d(anchor, negative):", d_neg)
print("Triplet loss:", triplet)
check_close("easy triplet has zero loss", triplet, 0.0)
Code cell 33
# Margin ranking loss: hinge on the preferred-vs-other score gap, with
# y_pref in {+1, -1} indicating which side should win.
s_a = np.array([3.0, 0.2, 1.5])
s_b = np.array([1.0, 1.2, 1.0])
y_pref = np.array([1, 1, -1])
rank_loss = np.clip(1.0 - y_pref * (s_a - s_b), 0.0, None)
print("Margin ranking losses:", rank_loss)
print("Mean ranking loss:", rank_loss.mean())
Code cell 34
def dpo_loss(logp_win, logp_lose, ref_win, ref_lose, beta=0.1):
    """DPO-style pairwise loss: -log sigmoid(beta * implicit reward margin).

    The margin is the policy-vs-reference log-prob gap of the preferred
    response minus that of the rejected one; logaddexp(0, -x) is a stable
    -log sigmoid(x).
    """
    margin = (logp_win - ref_win) - (logp_lose - ref_lose)
    return np.logaddexp(0.0, -(beta * margin))
# Two chosen/rejected pairs scored against a frozen reference model.
loss_pref = dpo_loss(
    np.array([-2.0, -1.0]),  # policy log-prob, preferred responses
    np.array([-3.0, -0.5]),  # policy log-prob, rejected responses
    np.array([-2.5, -1.2]),  # reference log-prob, preferred
    np.array([-2.6, -0.8]),  # reference log-prob, rejected
)
print("DPO-style pair losses:", np.round(loss_pref, 6))
check_true("preference loss is finite", np.isfinite(loss_pref).all())
9. Loss balancing
Code cell 36
# Sweep the box-loss weight to see how it shifts the overall loss mix.
loss_cls = np.array([0.5, 0.4, 0.6, 0.7])
loss_box = np.array([0.02, 0.03, 0.05, 0.04])
cls_mean = loss_cls.mean()  # loop-invariant
box_mean = loss_box.mean()
for lam in [1, 5, 10, 20]:
    total = cls_mean + lam * box_mean
    frac_box = lam * box_mean / total
    print(f"lambda_box={lam:2d}, total={total:.4f}, box contribution={frac_box:.2%}")
Code cell 37
# Static per-term scales; report each term's share of the weighted total.
terms = {"classification": 0.55, "box": 0.04, "contrastive": 1.80}
scales = {"classification": 1.0, "box": 10.0, "contrastive": 0.2}
total = sum(scales[name] * value for name, value in terms.items())
for name, value in terms.items():
    share = scales[name] * value / total
    print(f"{name:>14}: weighted contribution={share:.2%}")
check_close("contributions sum to one", sum(scales[k] * terms[k] / total for k in terms), 1.0)
10. Summary checks
Code cell 39
# Quick regression tests over quantities computed earlier in the notebook.
checks = [
    check_true("MSE gradient grows with residual", abs(2 * 5) > abs(np.sign(5))),
    check_true("Huber clips large residual gradient", abs(huber_grad[0]) <= delta + 1e-12),
    check_true("stable CE is finite", np.isfinite(ce).all()),
    check_true("masked mean uses valid denominator", mask.sum() == 5),
]
print(f"Passed {sum(checks)}/{len(checks)} summary checks.")
11. Reduction scale and gradient accumulation
Code cell 41
# Sum reduction scales the loss (and its gradients) by the batch size
# relative to mean reduction.
per_example = np.array([0.2, 0.4, 0.6, 0.8])
sum_reduction = np.sum(per_example)
mean_reduction = np.mean(per_example)
print("Sum reduction:", sum_reduction)
print("Mean reduction:", mean_reduction)
print("Gradient scale ratio sum/mean:", sum_reduction / mean_reduction)
check_close("sum equals batch_size times mean", sum_reduction, len(per_example) * mean_reduction)
12. Gaussian NLL with learned variance
Code cell 43
# Heteroscedastic Gaussian NLL with sigma parameterized as exp(log_sigma):
#   0.5 * [ (y - mu)^2 / sigma^2 + log sigma^2 + log 2*pi ]
y = np.array([0.0, 1.0, 2.0])
mu = np.array([0.1, 0.8, 2.5])
log_sigma = np.array([-1.0, 0.0, 0.5])
sigma2 = np.exp(2 * log_sigma)
gaussian_nll = 0.5 * (np.square(y - mu) / sigma2 + 2 * log_sigma + np.log(2 * np.pi))
print("Gaussian NLL per example:", np.round(gaussian_nll, 4))
check_true("Gaussian NLL finite", np.isfinite(gaussian_nll).all())
13. Class imbalance and positive weighting
Code cell 45
# Emulate BCE pos_weight: scale only the positive-class loss terms.
logits_binary = np.array([-2.0, -1.0, 0.5, 1.0, 2.0])
y_binary = np.array([0, 0, 0, 1, 1])
base = bce_logits(logits_binary, y_binary)
pos_weight = 4.0
is_positive = y_binary == 1
weighted = np.where(is_positive, pos_weight * base, base)
print("Base BCE:", np.round(base, 4))
print("Positive-weighted BCE:", np.round(weighted, 4))
print("Mean base:", base.mean(), "mean weighted:", weighted.mean())
check_true("positive weighting increases positive contribution", weighted[is_positive].sum() > base[is_positive].sum())
14. Proper scoring intuition
Code cell 47
# Log loss is a proper scoring rule: expected loss under the true
# distribution is minimized by reporting the true probability.
true_p = 0.7
reports = np.linspace(0.01, 0.99, 200)
expected_log_loss = -(true_p * np.log(reports) + (1 - true_p) * np.log(1 - reports))
best_report = reports[expected_log_loss.argmin()]
print("True probability:", true_p)
print("Best report under expected log loss:", round(float(best_report), 3))
check_true("log loss minimized near true probability", abs(best_report - true_p) < 0.01)
15. Temperature changes CE gradients
Code cell 49
# Dividing logits by a temperature rescales both the probabilities and the
# gradient (the extra 1/temp comes from the chain rule).
base_logits = np.array([[2.0, 1.0, 0.0]])
target = np.array([0])
one_hot_target = np.eye(3)[target]  # loop-invariant
for temp in (0.5, 1.0, 2.0):
    probs_temp = softmax(base_logits / temp, axis=1)
    grad_temp = (probs_temp - one_hot_target) / temp
    print(f"temperature={temp}: probs={np.round(probs_temp, 4)}, grad_norm={np.linalg.norm(grad_temp):.4f}")
16. Label smoothing changes the target vector
Code cell 51
# Smoothing moves the target off the one-hot vertex, so the gradient no
# longer pushes the correct-class probability all the way to 1.
logit = np.array([[3.0, 0.0, -1.0, -2.0]])
hard_target = np.array([[1.0, 0.0, 0.0, 0.0]])
num_classes = hard_target.shape[1]
for eps in (0.0, 0.1, 0.3):
    target = (1 - eps) * hard_target + eps / num_classes
    grad = softmax(logit, axis=1) - target
    print(f"epsilon={eps}: target={np.round(target, 3)}, grad={np.round(grad, 3)}")
17. Negative sampling as a loss-design choice
Code cell 53
# InfoNCE denominator contains the positive plus the sampled negatives;
# hard negatives (scores close to the positive's) raise the loss.
pos_score = 2.0
neg_scores_easy = np.array([-3.0, -2.5, -2.0])
neg_scores_hard = np.array([1.8, 1.6, 1.4])
loss_easy = logsumexp(np.r_[pos_score, neg_scores_easy], axis=0) - pos_score
loss_hard = logsumexp(np.r_[pos_score, neg_scores_hard], axis=0) - pos_score
print("InfoNCE with easy negatives:", loss_easy)
print("InfoNCE with hard negatives:", loss_hard)
check_true("hard negatives increase contrastive loss", loss_hard > loss_easy)