ML-Specific Math for LLMs

ML Specific Math

ML Specific Math

Exercises Notebook

Converted from exercises.ipynb for web reading.

Loss Functions - Exercises

Ten graded exercises. Each exercise has a problem statement, a runnable learner scaffold, and a complete solution cell.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn theming when it is installed; otherwise fall back to the
# matplotlib-bundled port of the seaborn whitegrid style.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults for every exercise plot in this notebook.
_RC_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_RC_DEFAULTS)
np.random.seed(42)  # reproducible randomness across runs
print("Plot setup complete.")

Code cell 3

import numpy as np

def header(title):
    """Print *title* framed above and below by a 72-character rule."""
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)

def check_close(name, value, expected, tol=1e-7):
    """Print PASS/FAIL for a numeric comparison within *tol* and return the verdict."""
    passed = np.allclose(value, expected, atol=tol, rtol=tol)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return passed

def check_true(name, condition):
    """Print PASS/FAIL for a boolean condition and return it as a plain bool."""
    passed = bool(condition)
    print(("PASS" if passed else "FAIL") + f" - {name}")
    return passed

def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    The naive form ``1 / (1 + np.exp(-z))`` overflows in ``exp`` for large
    negative ``z`` (RuntimeWarning, inf intermediate). Evaluating
    ``exp(-|z|)`` keeps the exponent non-positive, so both branches below
    stay finite for any input magnitude.

    Parameters: z - scalar or ndarray of logits.
    Returns: ndarray (or 0-d array for scalar input) of probabilities in [0, 1].
    """
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — same value, safe exponent.
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

def logsumexp(logits, axis=-1):
    """Stable log(sum(exp(logits))) along *axis* via the max-shift trick."""
    peak = np.max(logits, axis=axis, keepdims=True)
    summed = np.sum(np.exp(logits - peak), axis=axis, keepdims=True)
    return np.squeeze(peak + np.log(summed), axis=axis)

def softmax(logits, axis=-1):
    """Stable softmax along *axis* (max-shifted before exponentiation)."""
    scaled = np.exp(logits - np.max(logits, axis=axis, keepdims=True))
    return scaled / scaled.sum(axis=axis, keepdims=True)

print("Exercise helpers ready.")

Exercise 1: MSE gradient (*)

Compute MSE and its gradient for residuals y_pred - y_true.

Code cell 5

# Your Solution
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 1.0, 5.0])
mse = None
grad = None
print("mse:", mse)
print("grad:", grad)

Code cell 6

# Solution
header("Exercise 1: MSE gradient")
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 1.0, 5.0])
residual = y_pred - y_true
mse = np.mean(residual ** 2)
grad = 2 * residual / len(residual)
check_close("mse", mse, (0.5**2 + (-1.0)**2 + 2.0**2) / 3)
check_close("gradient", grad, np.array([1.0, -2.0, 4.0]) / 3)
print("\nTakeaway: MSE gives large residuals proportionally larger gradients.")

Exercise 2: Huber loss (*)

Implement Huber loss for residuals with delta=1.

Code cell 8

# Your Solution
residual = np.array([-2.0, -0.5, 0.0, 0.5, 3.0])
delta = 1.0
huber = None
print("huber:", huber)

Code cell 9

# Solution
header("Exercise 2: Huber loss")
residual = np.array([-2.0, -0.5, 0.0, 0.5, 3.0])
delta = 1.0
huber = np.where(np.abs(residual) <= delta, 0.5 * residual**2, delta * (np.abs(residual) - 0.5 * delta))
expected = np.array([1.5, 0.125, 0.0, 0.125, 2.5])
check_close("Huber values", huber, expected)
print("\nTakeaway: Huber is quadratic near zero and linear in the tails.")

Exercise 3: BCE from logits (*)

Compute binary cross-entropy directly from logits.

Code cell 11

# Your Solution
z = np.array([-2.0, 0.0, 2.0])
y = np.array([0.0, 1.0, 1.0])
loss = None
print("loss:", loss)

Code cell 12

# Solution
header("Exercise 3: BCE from logits")
z = np.array([-2.0, 0.0, 2.0])
y = np.array([0.0, 1.0, 1.0])
loss = np.maximum(z, 0) - z * y + np.log1p(np.exp(-np.abs(z)))
prob_loss = -(y * np.log(sigmoid(z)) + (1 - y) * np.log(1 - sigmoid(z)))
check_close("stable BCE equals probability BCE", loss, prob_loss)
print("\nTakeaway: Fused logit-space BCE avoids unstable probability logs.")

Exercise 4: Stable softmax CE (**)

Implement multiclass cross-entropy from logits using log-sum-exp.

Code cell 14

# Your Solution
logits = np.array([[2.0, 1.0, 0.0], [1000.0, 999.0, 998.0]])
targets = np.array([0, 2])
ce = None
print("ce:", ce)

Code cell 15

# Solution
header("Exercise 4: Stable softmax CE")
logits = np.array([[2.0, 1.0, 0.0], [1000.0, 999.0, 998.0]])
targets = np.array([0, 2])
ce = -logits[np.arange(len(targets)), targets] + logsumexp(logits, axis=1)
expected_first = -2.0 + np.log(np.exp(2.0) + np.exp(1.0) + np.exp(0.0))
check_close("first CE", ce[0], expected_first)
check_true("huge-logit CE finite", np.isfinite(ce).all())
print("\nTakeaway: subtracting the max inside log-sum-exp makes CE finite.")

Exercise 5: Masked sequence loss (**)

Compute a valid-token mean from token losses and a binary mask.

Code cell 17

# Your Solution
losses = np.array([[0.2, 0.5, 0.0], [0.3, 0.0, 0.0]])
mask = np.array([[1, 1, 0], [1, 0, 0]])
masked_mean = None
print("masked_mean:", masked_mean)

Code cell 18

# Solution
header("Exercise 5: Masked sequence loss")
losses = np.array([[0.2, 0.5, 0.0], [0.3, 0.0, 0.0]])
mask = np.array([[1, 1, 0], [1, 0, 0]])
masked_mean = np.sum(losses * mask) / np.sum(mask)
check_close("masked mean", masked_mean, (0.2 + 0.5 + 0.3) / 3)
print("\nTakeaway: sequence losses should divide by valid tokens, not padded length.")

Exercise 6: Focal loss (**)

Compare BCE and focal loss for easy and hard positive examples.

Code cell 20

# Your Solution
pt = np.array([0.95, 0.55, 0.10])
gamma = 2.0
focal = None
print("focal:", focal)

Code cell 21

# Solution
header("Exercise 6: Focal loss")
pt = np.array([0.95, 0.55, 0.10])
gamma = 2.0
bce = -np.log(pt)
focal = ((1 - pt) ** gamma) * bce
check_true("easy example downweighted most", focal[0] / bce[0] < focal[1] / bce[1] < focal[2] / bce[2])
print("BCE:", bce)
print("Focal:", focal)
print("\nTakeaway: focal loss preserves hard examples and suppresses easy ones.")

Exercise 7: InfoNCE (**)

Compute InfoNCE as row-wise cross-entropy over a similarity matrix.

Code cell 23

# Your Solution
sim = np.array([[2.0, 0.5, 0.1], [0.0, 1.5, 0.2], [0.1, 0.4, 1.0]])
temperature = 0.5
loss = None
print("loss:", loss)

Code cell 24

# Solution
header("Exercise 7: InfoNCE")
sim = np.array([[2.0, 0.5, 0.1], [0.0, 1.5, 0.2], [0.1, 0.4, 1.0]])
temperature = 0.5
labels = np.arange(sim.shape[0])
logits = sim / temperature
loss = -logits[np.arange(3), labels] + logsumexp(logits, axis=1)
check_true("positive diagonal gives low mean loss", loss.mean() < 0.5)
print("InfoNCE losses:", loss)
print("\nTakeaway: InfoNCE is cross-entropy where the correct class is the positive pair.")

Exercise 8: Triplet loss (***)

Compute triplet loss and identify whether the triplet violates the margin.

Code cell 26

# Your Solution
anchor = np.array([0.0, 0.0])
positive = np.array([0.4, 0.0])
negative = np.array([0.6, 0.0])
margin = 0.5
loss = None
print("loss:", loss)

Code cell 27

# Solution
header("Exercise 8: Triplet loss")
anchor = np.array([0.0, 0.0])
positive = np.array([0.4, 0.0])
negative = np.array([0.6, 0.0])
margin = 0.5
d_pos = np.sum((anchor - positive) ** 2)
d_neg = np.sum((anchor - negative) ** 2)
loss = max(0.0, d_pos - d_neg + margin)
check_close("triplet loss", loss, 0.16 - 0.36 + 0.5)
check_true("triplet violates margin", loss > 0)
print("\nTakeaway: triplet loss trains only when the positive is not closer by enough margin.")

Exercise 9: Preference loss (***)

Implement a DPO-style pairwise logistic loss.

Code cell 29

# Your Solution
logp_w = np.array([-2.0, -1.5])
logp_l = np.array([-3.0, -1.0])
ref_w = np.array([-2.2, -1.4])
ref_l = np.array([-2.8, -1.1])
beta = 0.2
loss = None
print("loss:", loss)

Code cell 30

# Solution
header("Exercise 9: Preference loss")
logp_w = np.array([-2.0, -1.5])
logp_l = np.array([-3.0, -1.0])
ref_w = np.array([-2.2, -1.4])
ref_l = np.array([-2.8, -1.1])
beta = 0.2
advantage = beta * ((logp_w - ref_w) - (logp_l - ref_l))
loss = np.logaddexp(0.0, -advantage)
check_true("preference loss finite", np.isfinite(loss).all())
check_true("better relative advantage lowers logistic loss", loss[0] < np.log(2))
print("\nTakeaway: preference losses optimize relative chosen-vs-rejected log probabilities.")

Exercise 10: Loss balancing (***)

Compute contribution shares for a weighted multi-term objective.

Code cell 32

# Your Solution
terms = {"ce": 0.8, "kl": 0.05, "aux": 2.0}
weights = {"ce": 1.0, "kl": 10.0, "aux": 0.1}
shares = None
print("shares:", shares)

Code cell 33

# Solution
header("Exercise 10: Loss balancing")
terms = {"ce": 0.8, "kl": 0.05, "aux": 2.0}
weights = {"ce": 1.0, "kl": 10.0, "aux": 0.1}
total = sum(terms[k] * weights[k] for k in terms)
shares = {k: terms[k] * weights[k] / total for k in terms}
check_close("shares sum to one", sum(shares.values()), 1.0)
check_true("CE dominates after weighting", shares["ce"] > shares["aux"])
print("shares:", shares)
print("\nTakeaway: loss coefficients should be judged by weighted contribution, not by raw value.")
Previous | Next