Math for LLMs: Exercises Notebook

Multivariate Calculus / Chain Rule and Backpropagation

Converted from exercises.ipynb for web reading.

Chain Rule and Backpropagation - Exercises

10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.

Format          Description
--------------  -------------------------------------
Problem         Markdown cell with task description
Your Solution   Code cell for learner work
Solution        Reference solution with checks

Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize

try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False


def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def sigmoid(x):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def numerical_gradient(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        xp = x.copy(); xm = x.copy()
        xp[idx] += h; xm[idx] -= h
        grad[idx] = (f(xp) - f(xm)) / (2*h)
    return grad

def numerical_jacobian(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)

def grad_check(f, x, analytic_grad, h=1e-6):
    numeric_grad = numerical_gradient(f, x, h=h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def jacobian_fd(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
        ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, j] = (yp - ym) / (2*h)
    return J.reshape(y0.shape + x.shape)

def hessian_fd(f, x, h=1e-5):
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)



def grad_fd(f, x, h=1e-6):
    return numerical_gradient(f, x, h=h)



def fd_grad(f, x, h=1e-6):
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)
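
# Quick sanity check of the finite-difference helpers (an illustrative addition):
# the gradient of f(x) = sum(x^2) is 2x, which numerical_gradient should reproduce.
_x_demo = np.array([1.0, -2.0, 0.5])
check_close("helper sanity: grad of sum(x^2)", 2 * _x_demo,
            numerical_gradient(lambda v: np.sum(v**2), _x_demo), tol=1e-6)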

print("Chapter helper setup complete.")

Exercise 1 (★): Scalar Chain Rule Verification

Let $f(t) = \sin(t^2)$ and $g(t) = e^{3t}$.

(a) Compute $\frac{d}{dt}[f(g(t))]$ analytically using the chain rule.

(b) Evaluate the derivative at $t = 0$ and $t = 0.5$.

(c) Verify numerically using centred finite differences with $h = 10^{-5}$.

(d) Compute $\frac{d}{dt}[g(f(t))]$; explain why the order matters.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

f = lambda t: np.sin(t**2)
g = lambda t: np.exp(3*t)

# (a) d/dt[f(g(t))] = f'(g(t)) * g'(t)
#   f'(u) = cos(u^2) * 2u
#   g'(t) = 3 exp(3t)
def chain_fg_prime(t):
    gt = g(t)
    return np.cos(gt**2) * 2*gt * 3*np.exp(3*t)

# (d) d/dt[g(f(t))] = g'(f(t)) * f'(t)
#   g'(u) = 3 exp(3u)
#   f'(t) = cos(t^2) * 2t
def chain_gf_prime(t):
    ft = f(t)
    return 3*np.exp(3*ft) * np.cos(t**2)*2*t

header('Exercise 1: Scalar Chain Rule')

for t_val in [0.0, 0.5]:
    h_comp = lambda t: f(g(t))
    h_val = chain_fg_prime(t_val)
    fd_val = (h_comp(t_val+1e-5) - h_comp(t_val-1e-5)) / (2e-5)
    check_close(f'f∘g chain rule at t={t_val}', h_val, fd_val)

# (d) Order matters
t = 0.5
fg_val = chain_fg_prime(t)
gf_val = chain_gf_prime(t)
print(f'\n(d) d/dt[f(g(t))] at t=0.5: {fg_val:.6f}')
print(f'    d/dt[g(f(t))] at t=0.5: {gf_val:.6f}')
check_true('f∘g ≠ g∘f (order matters)', not np.isclose(fg_val, gf_val))

# Verify g∘f
h2_comp = lambda t: g(f(t))
fd_gf = (h2_comp(t+1e-5) - h2_comp(t-1e-5)) / (2e-5)
check_close('g∘f chain rule', chain_gf_prime(t), fd_gf)

print('\nTakeaway: Chain rule order follows composition order — Jacobians multiply left-to-right.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Jacobian Composition

Let $f: \mathbb{R}^3 \to \mathbb{R}^2$, $f(\mathbf{u}) = (u_1 u_2,\; u_2 + u_3^2)$ and $g: \mathbb{R}^2 \to \mathbb{R}^3$, $g(\mathbf{x}) = (x_1^2,\; x_1 x_2,\; e^{x_2})$.

(a) Compute $J_f(\mathbf{u})$ and $J_g(\mathbf{x})$ analytically.

(b) Compute $J_h = J_f(g(\mathbf{x})) \cdot J_g(\mathbf{x})$ using the chain rule at $\mathbf{x}_0 = (1, 0)^\top$.

(c) Verify against finite differences.

(d) Compute $J_h$ directly from $h = f \circ g$ and confirm it matches (b).

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

def g_fn(x):
    return np.array([x[0]**2, x[0]*x[1], np.exp(x[1])])

def f_fn(u):
    return np.array([u[0]*u[1], u[1] + u[2]**2])

def Jf(u):
    return np.array([
        [u[1],  u[0],  0.0  ],
        [0.0,   1.0,   2*u[2]]
    ])

def Jg(x):
    return np.array([
        [2*x[0],  0.0       ],
        [x[1],    x[0]      ],
        [0.0,     np.exp(x[1])]
    ])

x0 = np.array([1.0, 0.0])
gx0 = g_fn(x0)

header('Exercise 2: Jacobian Composition')

# (a) Jacobians at x0
print('J_g(x0):', Jg(x0))
print('J_f(g(x0)):', Jf(gx0))

# (b) Chain rule
Jh_chain = Jf(gx0) @ Jg(x0)
print('J_h via chain rule:')
print(Jh_chain.round(6))

# (c) Finite differences
h_fn = lambda x: f_fn(g_fn(x))
Jh_fd = jacobian_fd(h_fn, x0)
check_close('(c) Chain rule = FD', Jh_chain, Jh_fd)

# (d) Direct computation
# h(x) = f(g(x)) = (x1^2 * x1*x2,  x1*x2 + exp(2*x2))
# = (x1^3 * x2,  x1*x2 + exp(2*x2))
# dh1/dx1 = 3x1^2*x2 = 0 at (1,0);  dh1/dx2 = x1^3 = 1
# dh2/dx1 = x2 = 0;  dh2/dx2 = x1 + 2*exp(2*x2) = 1 + 2*1 = 3
Jh_direct = np.array([[0.0, 1.0], [0.0, 3.0]])  # at (1,0)
check_close('(d) Direct = chain rule', Jh_chain, Jh_direct)

print('\nTakeaway: J_{f∘g}(x) = J_f(g(x)) · J_g(x) — composition order is left-to-right.')

print("Exercise 2 solution complete.")

Exercise 3 (★): Backprop Through a 2-Layer Network

Network: $\mathbf{z}^{[1]} = W^{[1]}\mathbf{x} + \mathbf{b}^{[1]}$, $\mathbf{a}^{[1]} = \text{relu}(\mathbf{z}^{[1]})$, $\hat{y} = \mathbf{w}^{[2]} \cdot \mathbf{a}^{[1]} + b^{[2]}$, $\mathcal{L} = \tfrac{1}{2}(\hat{y} - y)^2$.

Use $W^{[1]} \in \mathbb{R}^{3 \times 2}$, $\mathbf{x} \in \mathbb{R}^2$, $\mathbf{w}^{[2]} \in \mathbb{R}^3$.

(a) Implement the forward pass and compute $\mathcal{L}$.

(b) Implement the backward pass manually.

(c) Verify all gradients against finite differences.

(d) Run 100 gradient descent steps with $\eta = 0.01$ and verify that the loss decreases.

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

np.random.seed(42)
W1 = np.array([[0.5, 0.1], [0.2, 0.3], [-0.1, 0.4]])
b1 = np.array([0.0, 0.0, 0.0])
w2 = np.array([0.4, 0.6, -0.2])
b2 = 0.0
x  = np.array([1.0, 2.0])
y  = 1.0

def forward(W1, b1, w2, b2, x):
    z1 = W1 @ x + b1
    a1 = relu(z1)
    yhat = w2 @ a1 + b2
    return yhat, a1, z1

def backward(W1, w2, z1, a1, x, yhat, y):
    delta_out = yhat - y
    dw2 = delta_out * a1
    db2 = delta_out
    d_a1 = delta_out * w2
    delta1 = d_a1 * relu_prime(z1)
    dW1 = np.outer(delta1, x)
    db1 = delta1
    return dW1, db1, dw2, db2

yhat, a1, z1 = forward(W1, b1, w2, b2, x)
loss = 0.5*(yhat - y)**2
dW1, db1, dw2, db2 = backward(W1, w2, z1, a1, x, yhat, y)

header('Exercise 3: 2-Layer Network Backprop')
print(f'(a) Loss: {loss:.6f}  yhat={yhat:.6f}')

# (c) Verify gradients
def loss_fn(W1_flat):
    W1_ = W1_flat.reshape(3, 2)
    yh, _, _ = forward(W1_, b1, w2, b2, x)
    return 0.5*(yh - y)**2

dW1_fd = grad_fd(loss_fn, W1.flatten()).reshape(3, 2)
check_close('(c) dW1 matches FD', dW1, dW1_fd)

def loss_fn_w2(w2_):
    yh, _, _ = forward(W1, b1, w2_, b2, x)
    return 0.5*(yh - y)**2

dw2_fd = grad_fd(loss_fn_w2, w2)
check_close('(c) dw2 matches FD', dw2, dw2_fd)

# (d) Gradient descent
lr = 0.01
W1_gd, b1_gd, w2_gd, b2_gd = W1.copy(), b1.copy(), w2.copy(), b2
losses_gd = []
for step in range(100):
    yh, a1_, z1_ = forward(W1_gd, b1_gd, w2_gd, b2_gd, x)
    losses_gd.append(0.5*(yh - y)**2)
    dW1_, db1_, dw2_, db2_ = backward(W1_gd, w2_gd, z1_, a1_, x, yh, y)
    W1_gd -= lr * dW1_; b1_gd -= lr * db1_
    w2_gd -= lr * dw2_; b2_gd -= lr * db2_

check_true('(d) Loss decreases', losses_gd[-1] < losses_gd[0])
print(f'    Initial loss: {losses_gd[0]:.6f}  Final loss: {losses_gd[-1]:.6f}')

print('\nTakeaway: Backprop recurrence δ^l = (W^{l+1})^T δ^{l+1} ⊙ σ\'(z^l) correctly computes all gradients.')
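
# Optional cross-check against torch.autograd (an illustrative sketch; it runs only if torch
# is installed, and rebuilds the same forward pass to compare autograd with the manual gradients).
if HAS_TORCH:
    W1_t = torch.tensor(W1, requires_grad=True)
    b1_t = torch.tensor(b1, requires_grad=True)
    w2_t = torch.tensor(w2, requires_grad=True)
    x_t  = torch.tensor(x)
    z1_t = W1_t @ x_t + b1_t
    a1_t = torch.relu(z1_t)
    yhat_t = w2_t @ a1_t + b2
    loss_t = 0.5 * (yhat_t - y)**2
    loss_t.backward()
    check_close('dW1 matches torch.autograd', dW1, W1_t.grad.numpy())
    check_close('db1 matches torch.autograd', db1, b1_t.grad.numpy())
    check_close('dw2 matches torch.autograd', dw2, w2_t.grad.numpy())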

print("Exercise 3 solution complete.")

Exercise 4 (★★): Vanishing Gradient Analysis

(a) Build a 20-layer sigmoid network with weight scale $0.3$ (entries drawn i.i.d. Gaussian and scaled by $0.3$). Compute gradient norms at layers 1, 5, 10, 20.

(b) Repeat with ReLU activation and He initialisation.

(c) Add residual connections to the sigmoid network. Compare gradient norms.

(d) Apply Xavier initialisation to the sigmoid network.

(e) Plot gradient norm vs. layer depth for all four cases.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

np.random.seed(42)
L_ex4 = 20
n_ex4 = 30
x_ex4 = np.random.randn(n_ex4)
x_ex4 /= la.norm(x_ex4)

def compute_grad_norms(weight_scale, activation='sigmoid', residual=False, n_layers=L_ex4, n=n_ex4):
    np.random.seed(1)
    Ws = [np.random.randn(n, n) * weight_scale for _ in range(n_layers)]
    # Forward
    a = x_ex4.copy(); preacts = []
    for W in Ws:
        z = W @ a; preacts.append(z)
        if activation == 'sigmoid': a_new = sigmoid(z)
        elif activation == 'relu': a_new = relu(z)
        else: a_new = np.tanh(z)
        if residual: a = a + a_new
        else: a = a_new
    # Backward
    delta = np.random.randn(n); delta /= la.norm(delta)
    grad_norms = []
    for l in range(n_layers-1, -1, -1):
        if activation == 'sigmoid': sp = sigmoid(preacts[l])*(1-sigmoid(preacts[l]))
        elif activation == 'relu': sp = relu_prime(preacts[l])
        else: sp = 1 - np.tanh(preacts[l])**2
        if residual:
            delta = delta + Ws[l].T @ (delta * sp)
        else:
            delta = Ws[l].T @ (delta * sp)  # same dL/da convention as the residual branch
        grad_norms.append(la.norm(delta))
    return list(reversed(grad_norms))

norms_sigmoid = compute_grad_norms(0.3, 'sigmoid')
norms_relu    = compute_grad_norms(np.sqrt(2/n_ex4), 'relu')
norms_res     = compute_grad_norms(0.3, 'sigmoid', residual=True)
norms_xavier  = compute_grad_norms(np.sqrt(1/n_ex4), 'sigmoid')

header('Exercise 4: Vanishing Gradient Analysis')
layers_ex4 = list(range(1, L_ex4+1))

print('Gradient norm at layer 1:')
print(f'  Sigmoid (w=0.3):  {norms_sigmoid[0]:.2e}')
print(f'  ReLU (He):        {norms_relu[0]:.4f}')
print(f'  Sigmoid+Residual: {norms_res[0]:.4f}')
print(f'  Sigmoid (Xavier): {norms_xavier[0]:.4f}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(layers_ex4, norms_sigmoid, 'r-o', markersize=4, label='Sigmoid (w=0.3)')
    ax.semilogy(layers_ex4, norms_relu,    'g-s', markersize=4, label='ReLU (He init)')
    ax.semilogy(layers_ex4, norms_res,     'b-^', markersize=4, label='Sigmoid + Residual')
    ax.semilogy(layers_ex4, norms_xavier,  'k-D', markersize=4, label='Sigmoid (Xavier)')
    ax.set_xlabel('Layer'); ax.set_ylabel('Gradient norm')
    ax.set_title('Gradient Norm vs Depth (20 layers)')
    ax.legend(); ax.grid(True, alpha=0.4)
    plt.tight_layout(); plt.show()

check_true('ReLU norm stable', abs(norms_relu[0] - norms_relu[-1]) < 2.0)
check_true('Sigmoid vanishes', norms_sigmoid[0] < 1e-3 * norms_sigmoid[-1])

print('\nTakeaway: Vanishing gradients are cured by ReLU+He-init or residual connections.')
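
# Back-of-envelope estimate (illustrative, not required by the exercise): with sigmoid,
# |sigma'(z)| <= 0.25, so one backward step scales the gradient norm by at most 0.25 * ||W||_2.
# When that bound is below 1 the norm must shrink geometrically with depth; the observed decay
# is even faster because sigma' is usually well below 0.25.
W_demo = np.random.randn(n_ex4, n_ex4) * 0.3
bound = 0.25 * la.norm(W_demo, 2)
print(f'\nPer-layer bound 0.25*||W||_2 ≈ {bound:.2f}; product over {L_ex4} layers ≤ {bound**L_ex4:.2e}')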

print("Exercise 4 solution complete.")

Exercise 5 (★★): Gradient Checkpointing

(a) Implement a 10-layer feedforward network with full activation caching.

(b) Implement the same network with gradient checkpointing every 3 layers.

(c) Verify that both implementations produce identical gradients.

(d) Measure the memory reduction.

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

np.random.seed(42)
n_ck = 8
L_ck = 10
Ws_ck = [np.random.randn(n_ck, n_ck) * np.sqrt(2/n_ck) for _ in range(L_ck)]
x_ck = np.random.randn(n_ck)
y_ck = np.zeros(n_ck); y_ck[0] = 1.0

def standard_backprop(x, y, Ws):
    """Full caching — O(L) memory."""
    L = len(Ws)
    a = x.copy()
    cache = {'z': [], 'a': [a.copy()]}
    for W in Ws:
        z = W @ a
        a = relu(z)
        cache['z'].append(z); cache['a'].append(a.copy())
    loss = 0.5 * la.norm(a - y)**2
    delta = a - y                                        # dL/da at the network output
    grads = []
    for l in range(L-1, -1, -1):
        delta_z = delta * relu_prime(cache['z'][l])      # dL/dz_l
        grads.append(np.outer(delta_z, cache['a'][l]))   # dL/dW_l
        delta = Ws[l].T @ delta_z                        # dL/da_l
    return loss, list(reversed(grads))

def checkpointed_backprop(x, y, Ws, k=3):
    """Checkpoint every k layers; O(L/k + k) memory, minimised near k = sqrt(L)."""
    L = len(Ws)
    # Forward: save only checkpoint activations
    checkpoints = {0: x.copy()}
    a = x.copy()
    for l in range(L):
        z = Ws[l] @ a
        a = relu(z)
        if (l+1) % k == 0 or l == L-1:
            checkpoints[l+1] = a.copy()
    loss = 0.5 * la.norm(a - y)**2
    delta = a - y                                        # dL/da at the network output
    all_grads = [None] * L
    # Backward segment by segment, recomputing activations inside each segment
    seg_end = L
    while seg_end > 0:
        seg_start = max(c for c in checkpoints if c < seg_end)
        # Recompute activations for this segment
        a_seg = checkpoints[seg_start].copy()
        seg_acts = [a_seg.copy()]
        seg_preacts = []
        for l in range(seg_start, seg_end):
            z = Ws[l] @ a_seg
            a_seg = relu(z)
            seg_preacts.append(z); seg_acts.append(a_seg.copy())
        # Backward through the segment (delta enters as dL/da at layer seg_end)
        for l in range(seg_end-1, seg_start-1, -1):
            idx = l - seg_start
            delta_z = delta * relu_prime(seg_preacts[idx])   # dL/dz_l
            all_grads[l] = np.outer(delta_z, seg_acts[idx])  # dL/dW_l
            delta = Ws[l].T @ delta_z                        # dL/da_l
        seg_end = seg_start
    return loss, all_grads

loss_std, grads_std = standard_backprop(x_ck, y_ck, Ws_ck)
loss_ck,  grads_ck  = checkpointed_backprop(x_ck, y_ck, Ws_ck, k=3)

header('Exercise 5: Gradient Checkpointing')
check_close('(c) Losses match', loss_std, loss_ck)
for l in range(L_ck):
    check_close(f'    Layer {l} gradient', grads_std[l], grads_ck[l])

# (d) Memory
mem_std = L_ck * n_ck * 8  # bytes (float64)
k_opt = int(np.sqrt(L_ck)) + 1
mem_ck  = (L_ck // k_opt + 1) * n_ck * 8
print(f'\n(d) Standard memory: {mem_std} bytes  ({L_ck} activation vectors)')
print(f'    Checkpoint memory: {mem_ck} bytes  ({L_ck//k_opt+1} checkpoints)')
print(f'    Reduction: {mem_std/mem_ck:.1f}x  (theoretical sqrt(L) ~ {np.sqrt(L_ck):.1f}x)')

print('\nTakeaway: Checkpointing trades ~33% extra compute for sqrt(L) memory reduction.')
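
# Illustrative extension (not part of the original exercise): with a checkpoint every k layers,
# peak activation storage is roughly L/k checkpoints plus up to k recomputed activations per
# segment, i.e. O(L/k + k) vectors, minimised near k = sqrt(L).
print('\nCheckpoint interval k vs approximate stored activation vectors (L=10):')
for k_try in [1, 2, 3, 4, 5, 10]:
    print(f'  k={k_try:2d}: ~{int(np.ceil(L_ck / k_try)) + k_try} vectors')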

print("Exercise 5 solution complete.")

Exercise 6 (★★): Attention Gradient

Single-head attention: $O = P V$ where $P = \text{softmax}(QK^\top/\sqrt{d})$, with $Q, K, V \in \mathbb{R}^{T \times d}$, $T = 4$, $d = 3$.

(a) Implement the forward pass.

(b) Implement the backward pass, computing $\bar{Q}, \bar{K}, \bar{V}$ given $\bar{O}$.

(c) Verify all gradients against finite differences.

(d) Show that the memory for storing $P$ scales as $O(T^2)$; explain why FlashAttention avoids this.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution

np.random.seed(1)
T_a, d_a = 4, 3
Q_a = np.random.randn(T_a, d_a)
K_a = np.random.randn(T_a, d_a)
V_a = np.random.randn(T_a, d_a)
scale_a = 1.0 / np.sqrt(d_a)

def softmax_2d(S):
    S = S - S.max(axis=1, keepdims=True)
    e = np.exp(S)
    return e / e.sum(axis=1, keepdims=True)

def attention_fwd(Q, K, V, scale):
    S = Q @ K.T * scale
    P = softmax_2d(S)
    O = P @ V
    return O, P

def softmax_bwd(P, dP):
    return P * (dP - (P * dP).sum(axis=1, keepdims=True))

def attention_bwd(dO, P, Q, K, V, scale):
    dV = P.T @ dO
    dP = dO @ V.T
    dS = softmax_bwd(P, dP)
    dQ = dS @ K * scale
    dK = dS.T @ Q * scale
    return dQ, dK, dV

O_a, P_a = attention_fwd(Q_a, K_a, V_a, scale_a)
dO_a = np.random.randn(T_a, d_a)
dQ_a, dK_a, dV_a = attention_bwd(dO_a, P_a, Q_a, K_a, V_a, scale_a)

header('Exercise 6: Attention Gradient')

# Verify dQ
def proj_Q(Q_flat):
    Q_ = Q_flat.reshape(T_a, d_a)
    O_, _ = attention_fwd(Q_, K_a, V_a, scale_a)
    return (dO_a * O_).sum()

dQ_fd = grad_fd(proj_Q, Q_a.flatten()).reshape(T_a, d_a)
check_close('(c) dQ matches FD', dQ_a, dQ_fd)

def proj_V(V_flat):
    V_ = V_flat.reshape(T_a, d_a)
    O_, _ = attention_fwd(Q_a, K_a, V_, scale_a)
    return (dO_a * O_).sum()

dV_fd = grad_fd(proj_V, V_a.flatten()).reshape(T_a, d_a)
check_close('(c) dV matches FD', dV_a, dV_fd)

def proj_K(K_flat):
    K_ = K_flat.reshape(T_a, d_a)
    O_, P_ = attention_fwd(Q_a, K_, V_a, scale_a)
    return (dO_a * O_).sum()

dK_fd = grad_fd(proj_K, K_a.flatten()).reshape(T_a, d_a)
check_close('(c) dK matches FD', dK_a, dK_fd)

# (d) Memory scaling
print(f'\n(d) Attention score matrix P: shape = ({T_a},{T_a}), memory = {T_a**2 * 4} bytes (float32)')
for T_val in [512, 2048, 8192, 32768]:
    mem_mb = T_val**2 * 2 / (1024**2)  # float16
    print(f'  T={T_val:6d}: P storage = {mem_mb:.1f} MB (float16)')

print('FlashAttention avoids storing P by recomputing it tile-by-tile in SRAM.')
print('\nTakeaway: Attention backward needs P; FlashAttention stores only O(T) statistics.')
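
# Minimal sketch of the tiled / online-softmax idea behind FlashAttention (illustrative only):
# process K, V in blocks and keep running row-wise maxima and normalisers instead of the full
# T x T matrix P, then confirm the result matches the dense forward pass above.
def attention_fwd_streaming(Q, K, V, scale, block=2):
    T, d = Q.shape
    O = np.zeros((T, d))
    m_run = np.full(T, -np.inf)      # running row-wise max of the scores
    l_run = np.zeros(T)              # running row-wise softmax normaliser
    for start in range(0, T, block):
        Kb, Vb = K[start:start+block], V[start:start+block]
        S_blk = Q @ Kb.T * scale                      # scores against this key block
        m_new = np.maximum(m_run, S_blk.max(axis=1))
        alpha = np.exp(m_run - m_new)                 # rescale the old accumulators
        P_blk = np.exp(S_blk - m_new[:, None])
        l_run = l_run * alpha + P_blk.sum(axis=1)
        O = O * alpha[:, None] + P_blk @ Vb
        m_run = m_new
    return O / l_run[:, None]

O_stream = attention_fwd_streaming(Q_a, K_a, V_a, scale_a, block=2)
check_close('streaming (tiled) attention matches dense forward', O_stream, O_a)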

print("Exercise 6 solution complete.")

Exercise 7 (★★★): LoRA Gradient Analysis

Linear layer $\mathbf{y} = (W_0 + BA)\mathbf{x}$ with $W_0 \in \mathbb{R}^{8 \times 6}$ frozen and $B \in \mathbb{R}^{8 \times 2}$, $A \in \mathbb{R}^{2 \times 6}$ trainable.

(a) Implement forward and backward passes.

(b) Verify $\nabla_A \mathcal{L}$ and $\nabla_B \mathcal{L}$ against finite differences.

(c) Compare trainable parameter counts: LoRA vs full fine-tuning.

(d) Train LoRA for 200 steps on a toy problem. Compare convergence with full fine-tuning.

(e) Show that gradient memory scales as $O(r(m+n))$ vs $O(mn)$ for full fine-tuning.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

np.random.seed(42)
m_lo, n_lo, r_lo = 8, 6, 2
W0 = np.random.randn(m_lo, n_lo)
B  = np.zeros((m_lo, r_lo))
A  = np.random.randn(r_lo, n_lo) * 0.01
x_lo = np.random.randn(n_lo)
y_bar = np.random.randn(m_lo)

# (a) Forward
y_lo = (W0 + B @ A) @ x_lo

# (b) Backward
# dA: upstream flows through B, then outer product with x
dA = np.outer(B.T @ y_bar, x_lo)  # (r, n)
# dB: upstream outer product with (A x)^T
dB = np.outer(y_bar, x_lo @ A.T)  # (m, r)

header('Exercise 7: LoRA Gradient Analysis')

def proj_A(A_flat):
    A_ = A_flat.reshape(r_lo, n_lo)
    y_ = (W0 + B @ A_) @ x_lo
    return y_bar @ y_

dA_fd = grad_fd(proj_A, A.flatten()).reshape(r_lo, n_lo)
check_close('(b) dA matches FD', dA, dA_fd)

def proj_B(B_flat):
    B_ = B_flat.reshape(m_lo, r_lo)
    y_ = (W0 + B_ @ A) @ x_lo
    return y_bar @ y_

dB_fd = grad_fd(proj_B, B.flatten()).reshape(m_lo, r_lo)
check_close('(b) dB matches FD', dB, dB_fd)

# (c) Parameter counts
n_full = m_lo * n_lo
n_lora = r_lo * (m_lo + n_lo)
print(f'\n(c) Full gradient params: {n_full}  ({m_lo}x{n_lo})')
print(f'    LoRA gradient params:  {n_lora}  ({r_lo}x({m_lo}+{n_lo}))')
print(f'    Compression: {n_full/n_lora:.1f}x')

# (d) Training comparison
np.random.seed(42)
W_target = np.random.randn(m_lo, n_lo) * 0.5
X_tr = np.random.randn(50, n_lo)
Y_tr = X_tr @ W_target.T

B_tr, A_tr = np.zeros((m_lo, r_lo)), np.random.randn(r_lo, n_lo)*0.01
W_full = np.random.randn(m_lo, n_lo) * 0.1
lr_tr = 0.005
loss_lora_tr, loss_full_tr = [], []

for _ in range(200):
    idx = np.random.choice(50, 8)
    xb, yb = X_tr[idx], Y_tr[idx]
    # LoRA
    yh = xb @ (W0 + B_tr @ A_tr).T
    L = 0.5*np.mean((yh-yb)**2)
    loss_lora_tr.append(L)
    dy = (yh-yb)/8
    dA_tr = (B_tr.T @ dy.T) @ xb / m_lo
    dB_tr = (dy.T @ xb) @ A_tr.T / m_lo
    A_tr -= lr_tr*dA_tr; B_tr -= lr_tr*dB_tr
    # Full
    yh2 = xb @ W_full.T
    L2 = 0.5*np.mean((yh2-yb)**2)
    loss_full_tr.append(L2)
    W_full -= lr_tr * ((yh2-yb).T @ xb)/8 / m_lo

print(f'\n(d) Final loss — LoRA: {loss_lora_tr[-1]:.4f},  Full: {loss_full_tr[-1]:.4f}')
check_true('Both converge', loss_lora_tr[-1] < loss_lora_tr[0])

# (e) Memory scaling
print(f'\n(e) Gradient memory:')
for m_, n_, r_ in [(100,100,4), (1000,1000,8), (4096,4096,16)]:
    full = m_*n_*4  # bytes float32
    lora = r_*(m_+n_)*4
    print(f'  W({m_}x{n_}) r={r_}: full={full/1e3:.0f}KB, LoRA={lora/1e3:.0f}KB, {full/lora:.0f}x')

print('\nTakeaway: LoRA backward scales as O(r(m+n)) vs O(mn) — enabling fine-tuning 70B models on 1 GPU.')
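
# Illustrative follow-up (not part of the original exercise): at inference time the low-rank
# update can be merged into the frozen weight, W_merged = W0 + B A, so the adapted layer runs
# with the same latency as the base layer.
W_merged = W0 + B_tr @ A_tr
x_test = np.random.randn(n_lo)
check_close('merged weight reproduces the adapter path',
            W_merged @ x_test, W0 @ x_test + B_tr @ (A_tr @ x_test))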

print("Exercise 7 solution complete.")

Exercise 8 (★★★): REINFORCE and STE

(a) For $z \sim \text{Bernoulli}(\sigma(\theta))$ and $\mathcal{L} = z^2$, compute $\nabla_\theta \mathbb{E}[\mathcal{L}]$ analytically.

(b) Estimate the REINFORCE gradient with 10000 samples and verify.

(c) Implement variance reduction using a baseline $b = \mathbb{E}[\mathcal{L}]$.

(d) Implement the STE for quantisation: $\hat{w} = \text{round}(w)$, $\mathcal{L} = (\hat{w} - w_t)^2$.

(e) Compare STE-based quantisation-aware training with post-training quantisation on a toy task.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

header('Exercise 8: REINFORCE and STE')

def sig(t): return 1/(1+np.exp(-t))

theta_test = 0.5
p = sig(theta_test)

# (a) Analytical: E[L] = E[z^2] = sigma(theta) (since z in {0,1})
#    d/dtheta E[L] = d/dtheta sigma(theta) = sigma*(1-sigma)
analytical_grad = p * (1 - p)
print(f'(a) Analytical gradient at theta=0.5: {analytical_grad:.6f}')

# (b) REINFORCE: E[L * d/dtheta log p(z|theta)]
np.random.seed(42)
n_samp = 10000
z_samp = np.random.binomial(1, p, n_samp).astype(float)
L_samp = z_samp**2
# d/dtheta log p(z|theta): z=1 -> d/dtheta log sigma = 1-p; z=0 -> -p
score = np.where(z_samp == 1, 1-p, -p)
reinforce_est = np.mean(L_samp * score)
check_close('(b) REINFORCE gradient', reinforce_est, analytical_grad, tol=0.01)

# (c) Variance reduction with baseline b = E[L]
b_baseline = p  # E[z^2] = sigma(theta) = p
reinforce_baseline = np.mean((L_samp - b_baseline) * score)
print(f'\n(c) REINFORCE + baseline: {reinforce_baseline:.6f}')
print(f'    Variance without baseline: {np.var(L_samp * score):.6f}')
print(f'    Variance with baseline:    {np.var((L_samp-b_baseline)*score):.6f}')
var_reduction = np.var(L_samp*score) / np.var((L_samp-b_baseline)*score)
print(f'    Variance reduction: {var_reduction:.1f}x')

# (d) STE for quantisation
w = np.array([0.7, 1.3, -0.4, 2.1, -1.8])  # continuous weights
w_target = np.array([1.0, 1.0, 0.0, 2.0, -2.0])   # target quantised

n_ste_steps = 50
w_ste = w.copy()
lr_ste = 0.1
losses_ste = []

for _ in range(n_ste_steps):
    w_hat = np.round(w_ste)  # forward: quantise
    L_ste = 0.5 * np.sum((w_hat - w_target)**2)
    losses_ste.append(L_ste)
    # STE backward: treat round as identity -> grad flows as if w_hat = w_ste
    grad_ste = w_hat - w_target  # same as grad of continuous loss
    w_ste -= lr_ste * grad_ste

print(f'\n(d) STE training: initial loss={losses_ste[0]:.3f},  final loss={losses_ste[-1]:.3f}')

# (e) Post-training quantisation comparison
w_ptq = np.round(w)  # quantise without training
loss_ptq = 0.5 * np.sum((w_ptq - w_target)**2)
loss_ste_final = losses_ste[-1]
print(f'\n(e) PTQ loss (no training):      {loss_ptq:.4f}')
print(f'    STE-QAT loss (with training): {loss_ste_final:.4f}')
print(f'    Improvement: {loss_ptq/max(loss_ste_final,1e-8):.1f}x')
check_true('STE better than PTQ', loss_ste_final <= loss_ptq)

print('\nTakeaway: STE enables gradient-based optimisation through discrete operations.')
print('  Used in quantisation-aware training (QAT); contrast with post-training methods such as GPTQ and AWQ.')
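
# Illustrative check: a constant baseline leaves the REINFORCE estimator unbiased because the
# score function has zero mean, E[d/dtheta log p(z|theta)] = 0.
print(f'  Empirical mean of the score samples (should be ≈ 0): {np.mean(score):.4f}')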

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Residual Blocks and Gradient Flow

For a residual map $h_{k+1} = h_k + F_k(h_k)$, the local Jacobian is $I + J_{F_k}$.

Compare gradient-norm propagation through products of plain Jacobians $J_{F_k}$ and residual Jacobians $I + J_{F_k}$.

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Simulate products of Jacobians with and without residual identity paths.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - residual gradient flow
header("Exercise 9: residual gradient flow")

rng = np.random.default_rng(42)
d, L = 6, 20
Js = [0.12 * rng.normal(size=(d, d)) for _ in range(L)]
g = rng.normal(size=d)
plain = g.copy()
resid = g.copy()
for J in reversed(Js):
    plain = J.T @ plain
    resid = (np.eye(d) + J).T @ resid
print(f"initial norm={la.norm(g):.4e}")
print(f"plain backprop norm={la.norm(plain):.4e}")
print(f"residual backprop norm={la.norm(resid):.4e}")
check_true("residual path preserves larger gradient norm", la.norm(resid) > la.norm(plain))
print("Takeaway: identity paths add a stable derivative route through deep computation graphs.")

Exercise 10 (★★★): Backprop Through Dot-Product Attention Scores

Let $s_i = q^\top k_i/\sqrt{d}$, $a = \operatorname{softmax}(s)$, and $o = \sum_i a_i v_i$. For the scalar loss $L = c^\top o$, derive gradients with respect to $q$ and the keys $k_i$.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Differentiate scores, softmax, and the weighted sum.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - attention score backprop
header("Exercise 10: attention score backprop")

q = np.array([0.5, -0.4])
K = np.array([[0.2, 0.1], [-0.3, 0.7], [0.8, -0.2]])
V = np.array([[1.0, 0.0], [0.5, -0.5], [-0.2, 0.3]])
c = np.array([0.7, -0.1])
d = q.size

def loss(q, K):
    s = K @ q / np.sqrt(d)
    a = softmax(s)
    o = a @ V
    return c @ o

s = K @ q / np.sqrt(d)
a = softmax(s)
do = c
da = V @ do
J_soft = np.diag(a) - np.outer(a, a)
ds = J_soft @ da
grad_q = K.T @ ds / np.sqrt(d)
grad_K = np.outer(ds, q) / np.sqrt(d)
num_q = numerical_gradient(lambda qq: loss(qq, K), q)
num_K = numerical_gradient(lambda KK: loss(q, KK.reshape(K.shape)), K)
print("attention weights:", a)
print("grad_q:", grad_q)
check_close("q gradient", grad_q, num_q, tol=1e-6)
check_close("K gradient", grad_K, num_K, tol=1e-6)
print("Takeaway: attention backprop is ordinary chain rule through scores, softmax, and values.")

What to Review After Finishing

  • Exercise 1 — Scalar chain rule: composition order matters; $(f\circ g)' = f'(g)\cdot g'$
  • Exercise 2 — Jacobian chain rule: $J_{f\circ g} = J_f(g(\mathbf{x})) \cdot J_g(\mathbf{x})$
  • Exercise 3 — Backprop recurrence: $\boldsymbol{\delta}^{[l]} = (W^{[l+1]})^\top \boldsymbol{\delta}^{[l+1]} \odot \sigma'(\mathbf{z}^{[l]})$
  • Exercise 4 — Vanishing gradients cured by ReLU + He init or residual connections
  • Exercise 5 — Checkpointing achieves $O(\sqrt{L})$ memory at roughly $+33\%$ compute
  • Exercise 6 — Attention backward needs $P$; FlashAttention stores only $O(T)$ statistics
  • Exercise 7 — LoRA reduces gradient memory from $O(mn)$ to $O(r(m+n))$
  • Exercise 8 — STE passes gradients through discrete ops; REINFORCE for stochastic nodes
  • Exercise 9 — Residual Jacobians $I + J_{F_k}$ preserve gradient norms where plain products shrink
  • Exercise 10 — Attention score gradients follow the ordinary chain rule through scores, softmax, and values

References

  1. Rumelhart, Hinton & Williams (1986) — Original backpropagation paper
  2. He et al. (2015) — He initialisation for ReLU networks
  3. Dao et al. (2022) — FlashAttention: memory-efficient attention backward
  4. Hu et al. (2022) — LoRA: low-rank adaptation and its backward pass
  5. Williams (1992) — REINFORCE gradient estimator
  6. Bengio et al. (2013) — Straight-through estimator
