Loss Functions - Theory Notebook

Converted from theory.ipynb for web reading.

This notebook is the executable companion to notes.md. It turns loss formulas into curves, gradients, masking rules, and stability checks.

Code cell 3

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

Code cell 4

def sigmoid(z):
    z = np.asarray(z, dtype=float)
    return 1.0 / (1.0 + np.exp(-z))


def softmax(logits, axis=-1):
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)


def logsumexp(logits, axis=-1, keepdims=False):
    logits = np.asarray(logits, dtype=float)
    m = np.max(logits, axis=axis, keepdims=True)
    out = m + np.log(np.sum(np.exp(logits - m), axis=axis, keepdims=True))
    if not keepdims:
        out = np.squeeze(out, axis=axis)
    return out


def check_close(name, value, expected, tol=1e-8):
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok


def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok


print("Helper functions ready.")

1. Loss as a training signal

Code cell 6

y_true = np.array([2.0, -1.0, 0.5, 3.0])
y_pred = np.array([1.5, -0.5, 1.0, 10.0])
residual = y_pred - y_true
print("Residuals:", residual)
print("Large residual dominates MSE:", residual[-1] ** 2)
print("Large residual has linear MAE penalty:", abs(residual[-1]))

Code cell 7

r = np.linspace(-5, 5, 401)
mse = r ** 2
mae = np.abs(r)
delta = 1.0
huber = np.where(np.abs(r) <= delta, 0.5 * r**2, delta * (np.abs(r) - 0.5 * delta))
fig, ax = plt.subplots()
ax.plot(r, mse, label="MSE", color=COLORS["primary"])
ax.plot(r, mae, label="MAE", color=COLORS["secondary"])
ax.plot(r, huber, label="Huber delta=1", color=COLORS["tertiary"])
ax.set_title("Regression loss curves")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Plotted MSE, MAE, and Huber loss.")

2. Regression gradients

Code cell 9

mse_grad = 2 * r
mae_grad = np.sign(r)
huber_grad = np.where(np.abs(r) <= delta, r, delta * np.sign(r))
fig, ax = plt.subplots()
ax.plot(r, mse_grad, label="MSE gradient", color=COLORS["primary"])
ax.plot(r, mae_grad, label="MAE subgradient", color=COLORS["secondary"])
ax.plot(r, huber_grad, label="Huber gradient", color=COLORS["tertiary"])
ax.set_title("Gradient scale differs by loss")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Gradient with respect to prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Gradient scale comparison complete.")

Code cell 10

def quantile_loss(y, q, tau):
    residual = y - q
    return np.maximum(tau * residual, (tau - 1) * residual)

taus = [0.1, 0.5, 0.9]
fig, ax = plt.subplots()
for tau, color in zip(taus, [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"]]):
    ax.plot(r, quantile_loss(r, 0.0, tau), label=f"tau={tau}", color=color)
ax.set_title("Quantile loss is asymmetric")
ax.set_xlabel("Target residual $y-q$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Quantile loss shows asymmetric penalties.")

Code cell 11

log_cosh = np.log(np.cosh(r))
fig, ax = plt.subplots()
ax.plot(r, log_cosh, label="log-cosh", color=COLORS["primary"])
ax.plot(r, huber, label="Huber delta=1", color=COLORS["secondary"], linestyle="--")
ax.set_title("Smooth robust losses")
ax.set_xlabel("Residual $r$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("log-cosh behaves quadratically near zero and linearly in the tails.")

3. Binary classification from logits

Code cell 13

def bce_probability(p, y):
    eps = 1e-12
    p = np.clip(p, eps, 1 - eps)
    return -(y * np.log(p) + (1 - y) * np.log(1 - p))

def bce_logits(z, y):
    return np.maximum(z, 0) - z * y + np.log1p(np.exp(-np.abs(z)))

z = np.array([-8.0, -1.0, 0.0, 1.0, 8.0])
y = np.array([0, 0, 1, 1, 1])
p = sigmoid(z)
loss_prob = bce_probability(p, y)
loss_logits = bce_logits(z, y)
print("Prob-space BCE:", np.round(loss_prob, 6))
print("Logit-space BCE:", np.round(loss_logits, 6))
check_close("probability and logit BCE match", loss_prob, loss_logits, tol=1e-10)

Code cell 14

z_grid = np.linspace(-10, 10, 400)
loss_y0 = bce_logits(z_grid, 0)
loss_y1 = bce_logits(z_grid, 1)
fig, ax = plt.subplots()
ax.plot(z_grid, loss_y0, label="target y=0", color=COLORS["primary"])
ax.plot(z_grid, loss_y1, label="target y=1", color=COLORS["secondary"])
ax.set_title("Binary cross-entropy from logits")
ax.set_xlabel("Logit $z$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("BCE loss penalizes confident wrong logits sharply.")

Code cell 15

grad_bce = sigmoid(z_grid) - 1
fig, ax = plt.subplots()
ax.plot(z_grid, grad_bce, label="gradient for y=1", color=COLORS["primary"])
ax.axhline(0, color=COLORS["neutral"], linestyle="--", label="zero")
ax.set_title("BCE logit gradient is sigmoid(z)-y")
ax.set_xlabel("Logit $z$")
ax.set_ylabel("Gradient")
ax.legend()
fig.tight_layout()
plt.show()
print("BCE gradient demonstration complete.")

4. Multiclass CE and log-sum-exp

Code cell 17

logits = np.array([[3.0, 1.0, -2.0], [1000.0, 999.0, 998.0], [-4.0, 2.0, 1.0]])
targets = np.array([0, 2, 1])

def cross_entropy_logits(logits, targets):
    lse = logsumexp(logits, axis=1)
    return -logits[np.arange(len(targets)), targets] + lse

ce = cross_entropy_logits(logits, targets)
print("Stable CE:", np.round(ce, 6))
check_true("finite CE for huge logits", np.isfinite(ce).all())

Code cell 18

probs = softmax(logits, axis=1)
one_hot = np.eye(3)[targets]
grad = probs - one_hot
print("Softmax probabilities:\n", np.round(probs, 4))
print("CE gradient probs - one_hot:\n", np.round(grad, 4))
print("Row sums of gradient:", np.round(grad.sum(axis=1), 12))
check_close("each CE logit gradient row sums to zero", grad.sum(axis=1), np.zeros(3))

Code cell 19

z1 = np.linspace(-4, 4, 160)
z2 = np.linspace(-4, 4, 160)
Z1, Z2 = np.meshgrid(z1, z2)
logit_grid = np.stack([Z1, Z2, np.zeros_like(Z1)], axis=-1)
target_grid = np.zeros(Z1.size, dtype=int)
CE_grid = cross_entropy_logits(logit_grid.reshape(-1, 3), target_grid).reshape(Z1.shape)
fig, ax = plt.subplots(figsize=(8, 7))
im = ax.contourf(Z1, Z2, CE_grid, levels=40, cmap="plasma")
fig.colorbar(im, ax=ax, label="CE for class 0")
ax.set_title("Cross-entropy landscape over two logits")
ax.set_xlabel("Logit $z_1$")
ax.set_ylabel("Logit $z_2$")
fig.tight_layout()
plt.show()
print("CE landscape plotted.")

5. Hinge, focal, and label smoothing

Code cell 21

margin = np.linspace(-3, 3, 400)  # y*s
hinge = np.maximum(0, 1 - margin)
logistic = np.log1p(np.exp(-margin))
fig, ax = plt.subplots()
ax.plot(margin, hinge, label="hinge", color=COLORS["primary"])
ax.plot(margin, logistic, label="logistic", color=COLORS["secondary"])
ax.axvline(1, linestyle="--", color=COLORS["neutral"], label="margin=1")
ax.set_title("Margin losses")
ax.set_xlabel("Signed margin $ys$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Hinge has zero loss beyond the margin.")

Code cell 22

pt = np.linspace(0.001, 0.999, 400)
fig, ax = plt.subplots()
for gamma, color in zip([0, 1, 2, 5], [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]]):
    focal = -((1 - pt) ** gamma) * np.log(pt)
    ax.plot(pt, focal, label=f"gamma={gamma}", color=color)
ax.set_title("Focal loss downweights easy examples")
ax.set_xlabel("Correct-class probability $p_t$")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()
plt.show()
print("Focal loss comparison complete.")

Code cell 23

C = 5
eps = 0.1
hard = np.eye(C)[2]
smooth = (1 - eps) * hard + eps * np.ones(C) / C
print("Hard target:", hard)
print("Smoothed target:", smooth)
print("Target sums:", hard.sum(), smooth.sum())
check_close("smoothed target sums to 1", smooth.sum(), 1.0)

6. Masked and weighted losses

Code cell 25

token_losses = np.array([[0.2, 0.7, 0.0, 0.0],
                         [0.4, 0.6, 0.9, 0.0]])
mask = np.array([[1, 1, 0, 0],
                 [1, 1, 1, 0]])
masked_mean = (token_losses * mask).sum() / mask.sum()
batch_mean_wrong = (token_losses * mask).sum() / token_losses.shape[0]
print("Masked token mean:", masked_mean)
print("Wrong batch denominator:", batch_mean_wrong)
check_true("denominators change the objective", abs(masked_mean - batch_mean_wrong) > 1e-6)

Code cell 26

losses = np.array([0.1, 0.4, 2.0, 0.3])
weights = np.array([1.0, 1.0, 5.0, 1.0])
weighted_mean = np.sum(weights * losses) / np.sum(weights)
plain_mean = np.mean(losses)
print("Plain mean:", plain_mean)
print("Weighted mean:", weighted_mean)
check_true("rare or important examples can dominate", weighted_mean > plain_mean)

7. Contrastive losses

Code cell 28

def cosine_similarity_matrix(A, B):
    A = A / np.linalg.norm(A, axis=1, keepdims=True)
    B = B / np.linalg.norm(B, axis=1, keepdims=True)
    return A @ B.T

queries = np.random.normal(size=(6, 4))
keys = queries + 0.25 * np.random.normal(size=(6, 4))
sim = cosine_similarity_matrix(queries, keys)
labels = np.arange(6)
info_nce = cross_entropy_logits(sim / 0.2, labels)
print("Similarity matrix:\n", np.round(sim, 3))
print("InfoNCE losses:", np.round(info_nce, 4))
print("Mean InfoNCE:", np.mean(info_nce))

Code cell 29

fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(sim, cmap="viridis", aspect="auto")
fig.colorbar(im, ax=ax, label="Cosine similarity")
ax.set_title("Positive pairs sit on the diagonal")
ax.set_xlabel("Key index")
ax.set_ylabel("Query index")
fig.tight_layout()
plt.show()
print("Contrastive similarity heatmap plotted.")

Code cell 30

for tau in [1.0, 0.5, 0.2, 0.1]:
    probs_tau = softmax(sim / tau, axis=1)
    diagonal_mass = np.mean(np.diag(probs_tau))
    print(f"tau={tau:0.1f}, mean positive probability={diagonal_mass:0.4f}")

8. Triplet and ranking losses

Code cell 32

anchor = np.array([0.0, 0.0])
positive = np.array([0.5, 0.2])
negative = np.array([1.5, 1.0])
margin = 0.5
d_pos = np.sum((anchor - positive) ** 2)
d_neg = np.sum((anchor - negative) ** 2)
triplet = max(0.0, d_pos - d_neg + margin)
print("d(anchor, positive):", d_pos)
print("d(anchor, negative):", d_neg)
print("Triplet loss:", triplet)
check_close("easy triplet has zero loss", triplet, 0.0)

Code cell 33

s_a = np.array([3.0, 0.2, 1.5])
s_b = np.array([1.0, 1.2, 1.0])
y_pref = np.array([1, 1, -1])
rank_loss = np.maximum(0, 1.0 - y_pref * (s_a - s_b))
print("Margin ranking losses:", rank_loss)
print("Mean ranking loss:", rank_loss.mean())

Code cell 34

def dpo_loss(logp_win, logp_lose, ref_win, ref_lose, beta=0.1):
    advantage = beta * ((logp_win - ref_win) - (logp_lose - ref_lose))
    return np.logaddexp(0.0, -advantage)

loss_pref = dpo_loss(np.array([-2.0, -1.0]), np.array([-3.0, -0.5]), np.array([-2.5, -1.2]), np.array([-2.6, -0.8]))
print("DPO-style pair losses:", np.round(loss_pref, 6))
check_true("preference loss is finite", np.isfinite(loss_pref).all())

9. Loss balancing

Code cell 36

loss_cls = np.array([0.5, 0.4, 0.6, 0.7])
loss_box = np.array([0.02, 0.03, 0.05, 0.04])
for lam in [1, 5, 10, 20]:
    total = loss_cls.mean() + lam * loss_box.mean()
    frac_box = lam * loss_box.mean() / total
    print(f"lambda_box={lam:2d}, total={total:.4f}, box contribution={frac_box:.2%}")

Code cell 37

terms = {"classification": 0.55, "box": 0.04, "contrastive": 1.80}
scales = {"classification": 1.0, "box": 10.0, "contrastive": 0.2}
total = sum(scales[k] * v for k, v in terms.items())
for k in terms:
    contribution = scales[k] * terms[k] / total
    print(f"{k:>14}: weighted contribution={contribution:.2%}")
check_close("contributions sum to one", sum(scales[k] * terms[k] / total for k in terms), 1.0)

10. Summary checks

Code cell 39

checks = []
checks.append(check_true("MSE gradient grows with residual", abs(2 * 5) > abs(np.sign(5))))
checks.append(check_true("Huber clips large residual gradient", abs(huber_grad[0]) <= delta + 1e-12))
checks.append(check_true("stable CE is finite", np.isfinite(ce).all()))
checks.append(check_true("masked mean uses valid denominator", mask.sum() == 5))
print(f"Passed {sum(checks)}/{len(checks)} summary checks.")

11. Reduction scale and gradient accumulation

Code cell 41

per_example = np.array([0.2, 0.4, 0.6, 0.8])
sum_reduction = per_example.sum()
mean_reduction = per_example.mean()
print("Sum reduction:", sum_reduction)
print("Mean reduction:", mean_reduction)
print("Gradient scale ratio sum/mean:", sum_reduction / mean_reduction)
check_close("sum equals batch_size times mean", sum_reduction, len(per_example) * mean_reduction)

12. Gaussian NLL with learned variance

Code cell 43

y = np.array([0.0, 1.0, 2.0])
mu = np.array([0.1, 0.8, 2.5])
log_sigma = np.array([-1.0, 0.0, 0.5])
sigma2 = np.exp(2 * log_sigma)
gaussian_nll = 0.5 * ((y - mu) ** 2 / sigma2 + 2 * log_sigma + np.log(2 * np.pi))
print("Gaussian NLL per example:", np.round(gaussian_nll, 4))
check_true("Gaussian NLL finite", np.isfinite(gaussian_nll).all())

13. Class imbalance and positive weighting

Code cell 45

logits_binary = np.array([-2.0, -1.0, 0.5, 1.0, 2.0])
y_binary = np.array([0, 0, 0, 1, 1])
base = bce_logits(logits_binary, y_binary)
pos_weight = 4.0
weighted = np.where(y_binary == 1, pos_weight * base, base)
print("Base BCE:", np.round(base, 4))
print("Positive-weighted BCE:", np.round(weighted, 4))
print("Mean base:", base.mean(), "mean weighted:", weighted.mean())
check_true("positive weighting increases positive contribution", weighted[y_binary == 1].sum() > base[y_binary == 1].sum())

14. Proper scoring intuition

Code cell 47

true_p = 0.7
reports = np.linspace(0.01, 0.99, 200)
expected_log_loss = -(true_p * np.log(reports) + (1 - true_p) * np.log(1 - reports))
best_report = reports[np.argmin(expected_log_loss)]
print("True probability:", true_p)
print("Best report under expected log loss:", round(float(best_report), 3))
check_true("log loss minimized near true probability", abs(best_report - true_p) < 0.01)

15. Temperature changes CE gradients

Code cell 49

base_logits = np.array([[2.0, 1.0, 0.0]])
target = np.array([0])
for temp in [0.5, 1.0, 2.0]:
    probs_temp = softmax(base_logits / temp, axis=1)
    grad_temp = (probs_temp - np.eye(3)[target]) / temp
    print(f"temperature={temp}: probs={np.round(probs_temp, 4)}, grad_norm={np.linalg.norm(grad_temp):.4f}")

16. Label smoothing changes the target vector

Code cell 51

logit = np.array([[3.0, 0.0, -1.0, -2.0]])
hard_target = np.array([[1.0, 0.0, 0.0, 0.0]])
for eps in [0.0, 0.1, 0.3]:
    target = (1 - eps) * hard_target + eps / hard_target.shape[1]
    grad = softmax(logit, axis=1) - target
    print(f"epsilon={eps}: target={np.round(target, 3)}, grad={np.round(grad, 3)}")

17. Negative sampling as a loss-design choice

Code cell 53

pos_score = 2.0
neg_scores_easy = np.array([-3.0, -2.5, -2.0])
neg_scores_hard = np.array([1.8, 1.6, 1.4])
loss_easy = -pos_score + logsumexp(np.r_[pos_score, neg_scores_easy], axis=0)
loss_hard = -pos_score + logsumexp(np.r_[pos_score, neg_scores_hard], axis=0)
print("InfoNCE with easy negatives:", loss_easy)
print("InfoNCE with hard negatives:", loss_hard)
check_true("hard negatives increase contrastive loss", loss_hard > loss_easy)
