Loss Functions - Theory Notebook

Converted from theory.ipynb for web reading.

This notebook is the executable companion to notes.md. It turns loss formulas into curves, gradients, masking rules, and stability checks.

Code cell 3

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

Code cell 4

def sigmoid(z):
    z = np.asarray(z, dtype=float)
    return 1.0 / (1.0 + np.exp(-z))


def softmax(logits, axis=-1):
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)


def logsumexp(logits, axis=-1, keepdims=False):
    logits = np.asarray(logits, dtype=float)
    m = np.max(logits, axis=axis, keepdims=True)
    out = m + np.log(np.sum(np.exp(logits - m), axis=axis, keepdims=True))
    if not keepdims:
        out = np.squeeze(out, axis=axis)
    return out


def check_close(name, value, expected, tol=1e-8):
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok


def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok


print("Helper functions ready.")

1. Loss as a training signal

Code cell 6

y_true = np.array([2.0, -1.0, 0.5, 3.0])
y_pred = np.array([1.5, -0.5, 1.0, 10.0])
residual = y_pred - y_true
print("Residuals:", residual)
print("Large residual dominates MSE:", residual[-1] ** 2)
print("Large residual has linear MAE penalty:", abs(residual[-1]))

Code cell 7

r = np.linspace(-5, 5, 401)
mse = r ** 2
mae = np.abs(r)
delta = 1.0
huber = np.where(np.abs(r) <= delta, 0.5 * r**2, delta * (np.abs(r) - 0.5 * delta))
fig, ax = plt.subplots()
ax.plot(r, mse, label="MSE", color=COLORS["primary"])
ax.plot(r, mae, label="MAE", color=COLORS["secondary"])
ax.plot(r, huber, label="Huber delta=1", color=COLORS["tertiary"])
ax.set_title("Regression loss curves")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Plotted MSE, MAE, and Huber loss.")

2. Regression gradients

Code cell 9

mse_grad = 2 * r
mae_grad = np.sign(r)
huber_grad = np.where(np.abs(r) <= delta, r, delta * np.sign(r))
fig, ax = plt.subplots()
ax.plot(r, mse_grad, label="MSE gradient", color=COLORS["primary"])
ax.plot(r, mae_grad, label="MAE subgradient", color=COLORS["secondary"])
ax.plot(r, huber_grad, label="Huber gradient", color=COLORS["tertiary"])
ax.set_title("Gradient scale differs by loss")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Gradient with respect to prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Gradient scale comparison complete.")

Code cell 10

def quantile_loss(y, q, tau):
    residual = y - q
    return np.maximum(tau * residual, (tau - 1) * residual)

taus = [0.1, 0.5, 0.9]
fig, ax = plt.subplots()
for tau, color in zip(taus, [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"]]):
    ax.plot(r, quantile_loss(r, 0.0, tau), label=f"tau={tau}", color=color)
ax.set_title("Quantile loss is asymmetric")
ax.set_xlabel("Target residual $y-q$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Quantile loss shows asymmetric penalties.")

Code cell 11

log_cosh = np.log(np.cosh(r))
fig, ax = plt.subplots()
ax.plot(r, log_cosh, label="log-cosh", color=COLORS["primary"])
ax.plot(r, huber, label="Huber delta=1", color=COLORS["secondary"], linestyle="--")
ax.set_title("Smooth robust losses")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("log-cosh behaves quadratically near zero and linearly in the tails.")

3. Binary classification from logits

Code cell 13

def bce_probability(p, y):
    eps = 1e-12
    p = np.clip(p, eps, 1 - eps)
    return -(y * np.log(p) + (1 - y) * np.log(1 - p))

def bce_logits(z, y):
    return np.maximum(z, 0) - z * y + np.log1p(np.exp(-np.abs(z)))

z = np.array([-8.0, -1.0, 0.0, 1.0, 8.0])
y = np.array([0, 0, 1, 1, 1])
p = sigmoid(z)
loss_prob = bce_probability(p, y)
loss_logits = bce_logits(z, y)
print("Prob-space BCE:", np.round(loss_prob, 6))
print("Logit-space BCE:", np.round(loss_logits, 6))
check_close("probability and logit BCE match", loss_prob, loss_logits, tol=1e-10)

Code cell 14

z_grid = np.linspace(-10, 10, 400)
loss_y0 = bce_logits(z_grid, 0)
loss_y1 = bce_logits(z_grid, 1)
fig, ax = plt.subplots()
ax.plot(z_grid, loss_y0, label="target y=0", color=COLORS["primary"])
ax.plot(z_grid, loss_y1, label="target y=1", color=COLORS["secondary"])
ax.set_title("Binary cross-entropy from logits")
ax.set_xlabel("Logit $z$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("BCE loss penalizes confident wrong logits sharply.")

Code cell 15

grad_bce = sigmoid(z_grid) - 1
fig, ax = plt.subplots()
ax.plot(z_grid, grad_bce, label="gradient for y=1", color=COLORS["primary"])
ax.axhline(0, color=COLORS["neutral"], linestyle="--", label="zero")
ax.set_title("BCE logit gradient is sigmoid(z)-y")
ax.set_xlabel("Logit $z$")
ax.set_ylabel("Gradient")
ax.legend()
fig.tight_layout()
plt.show()
print("BCE gradient demonstration complete.")

4. Multiclass CE and log-sum-exp

Code cell 17

logits = np.array([[3.0, 1.0, -2.0], [1000.0, 999.0, 998.0], [-4.0, 2.0, 1.0]])
targets = np.array([0, 2, 1])

def cross_entropy_logits(logits, targets):
    lse = logsumexp(logits, axis=1)
    return -logits[np.arange(len(targets)), targets] + lse

ce = cross_entropy_logits(logits, targets)
print("Stable CE:", np.round(ce, 6))
check_true("finite CE for huge logits", np.isfinite(ce).all())

Code cell 18

probs = softmax(logits, axis=1)
one_hot = np.eye(3)[targets]
grad = probs - one_hot
print("Softmax probabilities:\n", np.round(probs, 4))
print("CE gradient probs - one_hot:\n", np.round(grad, 4))
print("Row sums of gradient:", np.round(grad.sum(axis=1), 12))
check_close("each CE logit gradient row sums to zero", grad.sum(axis=1), np.zeros(3))

Code cell 19

z1 = np.linspace(-4, 4, 160)
z2 = np.linspace(-4, 4, 160)
Z1, Z2 = np.meshgrid(z1, z2)
logit_grid = np.stack([Z1, Z2, np.zeros_like(Z1)], axis=-1)
target_grid = np.zeros(Z1.size, dtype=int)
CE_grid = cross_entropy_logits(logit_grid.reshape(-1, 3), target_grid).reshape(Z1.shape)
fig, ax = plt.subplots(figsize=(8, 7))
im = ax.contourf(Z1, Z2, CE_grid, levels=40, cmap="plasma")
fig.colorbar(im, ax=ax, label="CE for class 0")
ax.set_title("Cross-entropy landscape over two logits")
ax.set_xlabel("Logit $z_1$")
ax.set_ylabel("Logit $z_2$")
fig.tight_layout()
plt.show()
print("CE landscape plotted.")

5. Hinge, focal, and label smoothing

Code cell 21

margin = np.linspace(-3, 3, 400)  # y*s
hinge = np.maximum(0, 1 - margin)
logistic = np.log1p(np.exp(-margin))
fig, ax = plt.subplots()
ax.plot(margin, hinge, label="hinge", color=COLORS["primary"])
ax.plot(margin, logistic, label="logistic", color=COLORS["secondary"])
ax.axvline(1, linestyle="--", color=COLORS["neutral"], label="margin=1")
ax.set_title("Margin losses")
ax.set_xlabel("Signed margin $ys$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Hinge has zero loss beyond the margin.")

Code cell 22

pt = np.linspace(0.001, 0.999, 400)
fig, ax = plt.subplots()
for gamma, color in zip([0, 1, 2, 5], [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]]):
    focal = -((1 - pt) ** gamma) * np.log(pt)
    ax.plot(pt, focal, label=f"gamma={gamma}", color=color)
ax.set_title("Focal loss downweights easy examples")
ax.set_xlabel("Correct-class probability $p_t$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Focal loss comparison complete.")

Code cell 23

C = 5
eps = 0.1
hard = np.eye(C)[2]
smooth = (1 - eps) * hard + eps * np.ones(C) / C
print("Hard target:", hard)
print("Smoothed target:", smooth)
print("Target sums:", hard.sum(), smooth.sum())
check_close("smoothed target sums to 1", smooth.sum(), 1.0)

6. Masked and weighted losses

Code cell 25

token_losses = np.array([[0.2, 0.7, 0.0, 0.0],
                         [0.4, 0.6, 0.9, 0.0]])
mask = np.array([[1, 1, 0, 0],
                 [1, 1, 1, 0]])
masked_mean = (token_losses * mask).sum() / mask.sum()
batch_mean_wrong = (token_losses * mask).sum() / token_losses.shape[0]
print("Masked token mean:", masked_mean)
print("Wrong batch denominator:", batch_mean_wrong)
check_true("denominators change the objective", abs(masked_mean - batch_mean_wrong) > 1e-6)

Code cell 26

losses = np.array([0.1, 0.4, 2.0, 0.3])
weights = np.array([1.0, 1.0, 5.0, 1.0])
weighted_mean = np.sum(weights * losses) / np.sum(weights)
plain_mean = np.mean(losses)
print("Plain mean:", plain_mean)
print("Weighted mean:", weighted_mean)
check_true("rare or important examples can dominate", weighted_mean > plain_mean)

7. Contrastive losses

Code cell 28

def cosine_similarity_matrix(A, B):
    A = A / np.linalg.norm(A, axis=1, keepdims=True)
    B = B / np.linalg.norm(B, axis=1, keepdims=True)
    return A @ B.T

queries = np.random.normal(size=(6, 4))
keys = queries + 0.25 * np.random.normal(size=(6, 4))
sim = cosine_similarity_matrix(queries, keys)
labels = np.arange(6)
info_nce = cross_entropy_logits(sim / 0.2, labels)
print("Similarity matrix:\n", np.round(sim, 3))
print("InfoNCE losses:", np.round(info_nce, 4))
print("Mean InfoNCE:", np.mean(info_nce))

Code cell 29

fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(sim, cmap="viridis", aspect="auto")
fig.colorbar(im, ax=ax, label="Cosine similarity")
ax.set_title("Positive pairs sit on the diagonal")
ax.set_xlabel("Key index")
ax.set_ylabel("Query index")
fig.tight_layout()
plt.show()
print("Contrastive similarity heatmap plotted.")

Code cell 30

for tau in [1.0, 0.5, 0.2, 0.1]:
    probs_tau = softmax(sim / tau, axis=1)
    diagonal_mass = np.mean(np.diag(probs_tau))
    print(f"tau={tau:0.1f}, mean positive probability={diagonal_mass:0.4f}")

8. Triplet and ranking losses

Code cell 32

anchor = np.array([0.0, 0.0])
positive = np.array([0.5, 0.2])
negative = np.array([1.5, 1.0])
margin = 0.5
d_pos = np.sum((anchor - positive) ** 2)
d_neg = np.sum((anchor - negative) ** 2)
triplet = max(0.0, d_pos - d_neg + margin)
print("d(anchor, positive):", d_pos)
print("d(anchor, negative):", d_neg)
print("Triplet loss:", triplet)
check_close("easy triplet has zero loss", triplet, 0.0)

Code cell 33

s_a = np.array([3.0, 0.2, 1.5])
s_b = np.array([1.0, 1.2, 1.0])
y_pref = np.array([1, 1, -1])
rank_loss = np.maximum(0, 1.0 - y_pref * (s_a - s_b))
print("Margin ranking losses:", rank_loss)
print("Mean ranking loss:", rank_loss.mean())

Code cell 34

def dpo_loss(logp_win, logp_lose, ref_win, ref_lose, beta=0.1):
    advantage = beta * ((logp_win - ref_win) - (logp_lose - ref_lose))
    return np.logaddexp(0.0, -advantage)

loss_pref = dpo_loss(np.array([-2.0, -1.0]), np.array([-3.0, -0.5]), np.array([-2.5, -1.2]), np.array([-2.6, -0.8]))
print("DPO-style pair losses:", np.round(loss_pref, 6))
check_true("preference loss is finite", np.isfinite(loss_pref).all())

9. Loss balancing

Code cell 36

loss_cls = np.array([0.5, 0.4, 0.6, 0.7])
loss_box = np.array([0.02, 0.03, 0.05, 0.04])
for lam in [1, 5, 10, 20]:
    total = loss_cls.mean() + lam * loss_box.mean()
    frac_box = lam * loss_box.mean() / total
    print(f"lambda_box={lam:2d}, total={total:.4f}, box contribution={frac_box:.2%}")

Code cell 37

terms = {"classification": 0.55, "box": 0.04, "contrastive": 1.80}
scales = {"classification": 1.0, "box": 10.0, "contrastive": 0.2}
total = sum(scales[k] * v for k, v in terms.items())
for k in terms:
    contribution = scales[k] * terms[k] / total
    print(f"{k:>14}: weighted contribution={contribution:.2%}")
check_close("contributions sum to one", sum(scales[k] * terms[k] / total for k in terms), 1.0)

10. Summary checks

Code cell 39

checks = []
checks.append(check_true("MSE gradient grows with residual", abs(2 * 5) > abs(np.sign(5))))
checks.append(check_true("Huber clips large residual gradient", abs(huber_grad[0]) <= delta + 1e-12))
checks.append(check_true("stable CE is finite", np.isfinite(ce).all()))
checks.append(check_true("masked mean uses valid denominator", mask.sum() == 5))
print(f"Passed {sum(checks)}/{len(checks)} summary checks.")

11. Reduction scale and gradient accumulation

Code cell 41

per_example = np.array([0.2, 0.4, 0.6, 0.8])
sum_reduction = per_example.sum()
mean_reduction = per_example.mean()
print("Sum reduction:", sum_reduction)
print("Mean reduction:", mean_reduction)
print("Gradient scale ratio sum/mean:", sum_reduction / mean_reduction)
check_close("sum equals batch_size times mean", sum_reduction, len(per_example) * mean_reduction)

12. Gaussian NLL with learned variance

Code cell 43

y = np.array([0.0, 1.0, 2.0])
mu = np.array([0.1, 0.8, 2.5])
log_sigma = np.array([-1.0, 0.0, 0.5])
sigma2 = np.exp(2 * log_sigma)
gaussian_nll = 0.5 * ((y - mu) ** 2 / sigma2 + 2 * log_sigma + np.log(2 * np.pi))
print("Gaussian NLL per example:", np.round(gaussian_nll, 4))
check_true("Gaussian NLL finite", np.isfinite(gaussian_nll).all())

13. Class imbalance and positive weighting

Code cell 45

logits_binary = np.array([-2.0, -1.0, 0.5, 1.0, 2.0])
y_binary = np.array([0, 0, 0, 1, 1])
base = bce_logits(logits_binary, y_binary)
pos_weight = 4.0
weighted = np.where(y_binary == 1, pos_weight * base, base)
print("Base BCE:", np.round(base, 4))
print("Positive-weighted BCE:", np.round(weighted, 4))
print("Mean base:", base.mean(), "mean weighted:", weighted.mean())
check_true("positive weighting increases positive contribution", weighted[y_binary == 1].sum() > base[y_binary == 1].sum())

14. Proper scoring intuition

Code cell 47

true_p = 0.7
reports = np.linspace(0.01, 0.99, 200)
expected_log_loss = -(true_p * np.log(reports) + (1 - true_p) * np.log(1 - reports))
best_report = reports[np.argmin(expected_log_loss)]
print("True probability:", true_p)
print("Best report under expected log loss:", round(float(best_report), 3))
check_true("log loss minimized near true probability", abs(best_report - true_p) < 0.01)

15. Temperature changes CE gradients

Code cell 49

base_logits = np.array([[2.0, 1.0, 0.0]])
target = np.array([0])
for temp in [0.5, 1.0, 2.0]:
    probs_temp = softmax(base_logits / temp, axis=1)
    grad_temp = (probs_temp - np.eye(3)[target]) / temp
    print(f"temperature={temp}: probs={np.round(probs_temp, 4)}, grad_norm={np.linalg.norm(grad_temp):.4f}")

16. Label smoothing changes the target vector

Code cell 51

logit = np.array([[3.0, 0.0, -1.0, -2.0]])
hard_target = np.array([[1.0, 0.0, 0.0, 0.0]])
for eps in [0.0, 0.1, 0.3]:
    target = (1 - eps) * hard_target + eps / hard_target.shape[1]
    grad = softmax(logit, axis=1) - target
    print(f"epsilon={eps}: target={np.round(target, 3)}, grad={np.round(grad, 3)}")

17. Negative sampling as a loss-design choice

Code cell 53

pos_score = 2.0
neg_scores_easy = np.array([-3.0, -2.5, -2.0])
neg_scores_hard = np.array([1.8, 1.6, 1.4])
loss_easy = -pos_score + logsumexp(np.r_[pos_score, neg_scores_easy], axis=0)
loss_hard = -pos_score + logsumexp(np.r_[pos_score, neg_scores_hard], axis=0)
print("InfoNCE with easy negatives:", loss_easy)
print("InfoNCE with hard negatives:", loss_hard)
check_true("hard negatives increase contrastive loss", loss_hard > loss_easy)
