Exercises Notebook
Converted from `exercises.ipynb` for web reading.
Partial Derivatives and Gradients - Exercises
10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.
| Format | Description |
|---|---|
| Problem | Markdown cell with task description |
| Your Solution | Code cell for learner work |
| Solution | Reference solution with checks |
Difficulty: straightforward -> moderate -> challenging.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's whitegrid theme when seaborn is installed; otherwise fall
# back to matplotlib's bundled port of the same style.
try:
    import seaborn as sns
except ImportError:
    HAS_SNS = False
    plt.style.use("seaborn-v0_8-whitegrid")
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults for every plot in this notebook.
_RC_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_RC_DEFAULTS)

# Fixed seed so every run reproduces the same figures.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

# Colour-blind-safe palette shared by all exercise plots.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

# Short alias used by some exercise cells.
spmin = minimize

# Torch is optional; exercises degrade gracefully without it.
try:
    import torch
except ImportError:
    torch = None
    HAS_TORCH = False
else:
    HAS_TORCH = True
def header(title):
    """Print *title* framed above and below by '=' rules sized to the text."""
    rule = "=" * len(title)
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(name, cond):
    """Print PASS/FAIL for a boolean condition and return it as bool."""
    result = bool(cond)
    label = "PASS" if result else "FAIL"
    print(f"{label} - {name}")
    return result
def check_close(name, got, expected, tol=1e-8):
    """Print PASS/FAIL comparing *got* to *expected* via np.allclose; return bool."""
    passed = np.allclose(got, expected, atol=tol, rtol=tol)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}: got {got}, expected {expected}")
    return passed
def check(name, got, expected, tol=1e-8):
    # Thin alias so cells can call check(...) and check_close(...) interchangeably.
    return check_close(name, got=got, expected=expected, tol=tol)
def sigmoid(x):
    """Numerically stable logistic function 1/(1+exp(-x)), vectorised over x."""
    x = np.asarray(x, dtype=float)
    # Two algebraically equal forms; np.where selects the one whose exp
    # argument is non-positive for each element.
    positive_branch = 1 / (1 + np.exp(-x))
    negative_branch = np.exp(x) / (1 + np.exp(x))
    return np.where(x >= 0, positive_branch, negative_branch)
def softmax(z, axis=-1):
    """Softmax along *axis*, max-shifted for numerical stability."""
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    expz = np.exp(shifted)
    return expz / np.sum(expz, axis=axis, keepdims=True)
def relu(x):
    """Elementwise rectifier: max(x, 0)."""
    return np.maximum(x, 0)
def relu_prime(x):
    """Subgradient of ReLU: 1 where x > 0, else 0 (0 chosen at x == 0)."""
    positive = np.asarray(x) > 0
    return np.where(positive, 1.0, 0.0)
def centered_diff(f, x, h=1e-6):
    """O(h^2) centered finite-difference estimate of f'(x)."""
    forward = f(x + h)
    backward = f(x - h)
    return (forward - backward) / (2 * h)
def numerical_gradient(f, x, h=1e-6):
    """Centered-difference gradient of scalar-valued f at array x.

    Works for any array shape by perturbing one element at a time.
    """
    x = np.asarray(x, dtype=float)
    out = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        plus = x.copy()
        minus = x.copy()
        plus[idx] += h
        minus[idx] -= h
        out[idx] = (f(plus) - f(minus)) / (2 * h)
    return out
def numerical_jacobian(f, x, h=1e-6):
    """Centered-difference Jacobian of vector-valued f at x.

    Returns an array shaped f(x).shape + x.shape.
    """
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    jac = np.zeros((y0.size, x.size), dtype=float)
    base = x.reshape(-1)
    for col in range(x.size):
        bumped_up = base.copy()
        bumped_dn = base.copy()
        bumped_up[col] += h
        bumped_dn[col] -= h
        fp = np.asarray(f(bumped_up.reshape(x.shape)))
        fm = np.asarray(f(bumped_dn.reshape(x.shape)))
        jac[:, col] = ((fp - fm) / (2 * h)).reshape(-1)
    return jac.reshape(y0.shape + x.shape)
def grad_check(f, x, analytic_grad, h=1e-6):
    """Relative error between analytic_grad and the finite-difference gradient.

    Uses the symmetric denominator ||a|| + ||n|| + 1e-12 so the result is
    well-defined even when both gradients are (near) zero.
    """
    numeric = numerical_gradient(f, x, h=h)
    gap = la.norm(analytic_grad - numeric)
    scale = la.norm(analytic_grad) + la.norm(numeric) + 1e-12
    return gap / scale
def jacobian_fd(f, x, h=1e-6):
    """Finite-difference Jacobian of f at x, shaped f(x).shape + x.shape."""
    x = np.asarray(x, dtype=float)
    y_base = np.asarray(f(x), dtype=float)
    jac = np.zeros((y_base.size, x.size), dtype=float)
    flat_x = x.reshape(-1)
    for col in range(x.size):
        hi = flat_x.copy()
        lo = flat_x.copy()
        hi[col] += h
        lo[col] -= h
        y_hi = np.asarray(f(hi.reshape(x.shape)), dtype=float).reshape(-1)
        y_lo = np.asarray(f(lo.reshape(x.shape)), dtype=float).reshape(-1)
        jac[:, col] = (y_hi - y_lo) / (2 * h)
    return jac.reshape(y_base.shape + x.shape)
def hessian_fd(f, x, h=1e-5):
    """Finite-difference Hessian of scalar f, shaped x.shape + x.shape.

    Each column is a centered difference of the numerical gradient, so
    the estimate inherits the inner gradient's truncation error.
    """
    x = np.asarray(x, dtype=float)
    hess = np.zeros((x.size, x.size), dtype=float)
    base = x.reshape(-1)
    scalar_f = lambda z: f(z.reshape(x.shape))
    for col in range(x.size):
        up = base.copy()
        dn = base.copy()
        up[col] += h
        dn[col] -= h
        g_up = numerical_gradient(scalar_f, up.reshape(x.shape), h=h).reshape(-1)
        g_dn = numerical_gradient(scalar_f, dn.reshape(x.shape), h=h).reshape(-1)
        hess[:, col] = (g_up - g_dn) / (2 * h)
    return hess.reshape(x.shape + x.shape)
def grad_fd(f, x, h=1e-6):
    """Alias for numerical_gradient (kept for backward compatibility)."""
    return numerical_gradient(f, x, h=h)

def fd_grad(f, x, h=1e-6):
    """Like grad_fd, but coerces x to a float array before differencing."""
    arr = np.asarray(x, dtype=float)
    return numerical_gradient(f, arr, h=h)

print("Chapter helper setup complete.")
Exercise 1 (★): Partial Derivatives: Computation and Interpretation
Compute all first-order partial derivatives of the following functions and evaluate them at the given points. Verify numerically using centered finite differences.
(a) $f(x, y) = x^3 y + e^{xy}$. Compute $\partial f/\partial x$ and $\partial f/\partial y$ analytically, then evaluate at $(x, y) = (1, 2)$.
(b) $g(x, y, z) = \sin(xy) + y z^2 - \ln(x + z)$. Compute all three partials at $(1, 0, 1)$.
(c) For $h(w) = \lVert w \rVert^2 = w^\top w$, compute $\nabla h$ analytically. Verify: $\nabla h(w) = 2w$.
(d) Compute the second-order mixed partial $\partial^2 f/\partial y\,\partial x$ for $f$ from part (a) and verify Clairaut's theorem: $\partial^2 f/\partial y\,\partial x = \partial^2 f/\partial x\,\partial y$.
Code cell 5
# Your Solution
# Exercise 1 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 1."
print(msg)
Code cell 6
# Solution
# Exercise 1 - reference solution
# (a) f(x, y) = x^3*y + exp(xy)
#     df/dx = 3x^2*y + y*exp(xy)
#     df/dy = x^3 + x*exp(xy)
def df_dx(x, y): return 3*x**2*y + y*np.exp(x*y)
def df_dy(x, y): return x**3 + x*np.exp(x*y)
f = lambda xy: xy[0]**3*xy[1] + np.exp(xy[0]*xy[1])
x0, y0 = 1.0, 2.0
# Centered differences with h = 1e-5 give O(h^2) truncation error.
num_dx = (f(np.array([x0+1e-5, y0])) - f(np.array([x0-1e-5, y0]))) / 2e-5
num_dy = (f(np.array([x0, y0+1e-5])) - f(np.array([x0, y0-1e-5]))) / 2e-5
header('Exercise 1: Partial Derivatives')
check_close('df/dx at (1,2)', df_dx(x0, y0), num_dx)
check_close('df/dy at (1,2)', df_dy(x0, y0), num_dy)
# (b) g(x,y,z) = sin(xy) + y*z^2 - ln(x+z)
#     dg/dx = y*cos(xy) - 1/(x+z)
#     dg/dy = x*cos(xy) + z^2
#     dg/dz = 2yz - 1/(x+z)
def dg_dx(x,y,z): return y*np.cos(x*y) - 1/(x+z)
def dg_dy(x,y,z): return x*np.cos(x*y) + z**2
def dg_dz(x,y,z): return 2*y*z - 1/(x+z)
g = lambda xyz: np.sin(xyz[0]*xyz[1]) + xyz[1]*xyz[2]**2 - np.log(xyz[0]+xyz[2])
p = np.array([1., 0., 1.])
ng = numerical_gradient(g, p)  # helper from the chapter setup cell
check_close('dg/dx at (1,0,1)', dg_dx(*p), ng[0])
check_close('dg/dy at (1,0,1)', dg_dy(*p), ng[1])
check_close('dg/dz at (1,0,1)', dg_dz(*p), ng[2])
# (c) Gradient of the squared Euclidean norm: grad ||w||^2 = 2w
def grad_norm_sq(w): return 2 * w
w = np.array([1., 2., 3.])
h_fn = lambda w: np.dot(w, w)
check_close('grad ||w||^2', grad_norm_sq(w), numerical_gradient(h_fn, w))
# (d) Clairaut's theorem: d^2f/dxdy = d^2f/dydx (partials are continuous here).
#     Analytic mixed partial of f from (a): 3x^2 + exp(xy) + xy*exp(xy).
#     The 4-point stencil below has O(h^2) error, hence the looser tolerance.
h = 1e-4
dfdxy = (f(np.array([x0+h, y0+h])) - f(np.array([x0+h, y0-h]))
         - f(np.array([x0-h, y0+h])) + f(np.array([x0-h, y0-h]))) / (4*h**2)
exact = 3*x0**2 + np.exp(x0*y0) + x0*y0*np.exp(x0*y0)
check_close('Clairaut d2f/dxdy = d2f/dydx', dfdxy, exact, tol=1e-4)
print('\nTakeaway: Centered finite differences achieve O(h^2) accuracy; '
      'Clairaut\'s theorem holds when partials are continuous.')
print("Exercise 1 solution complete.")
Exercise 2 (★): Gradient Geometry
Let $f(x, y) = x^2 + xy + y^2$.
(a) Compute $\nabla f$ analytically.
(b) At the point $p = (1, 1)$, find the gradient $\nabla f(p)$ and its magnitude $\lVert \nabla f(p) \rVert$.
(c) Show numerically that the gradient is perpendicular to the level curve $f(x, y) = 3$ at $p$. (Hint: parametrize the level curve near $p$ and compute the dot product with the tangent direction.)
(d) Find the unit vector $u$ that minimises $D_u f(p)$ (steepest descent direction) and verify $D_u f(p) = -\lVert \nabla f(p) \rVert$.
Code cell 8
# Your Solution
# Exercise 2 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 2."
print(msg)
Code cell 9
# Solution
# Exercise 2 - reference solution
# f(x, y) = x^2 + xy + y^2; analytic gradient is (2x + y, x + 2y).
f2 = lambda xy: xy[0]**2 + xy[0]*xy[1] + xy[1]**2
def grad_f(x, y):
    """Analytic gradient of f2 at (x, y)."""
    return np.array([2*x + y, x + 2*y])
def steepest_descent_dir(x, y):
    """Unit vector in the steepest-descent direction, -grad/||grad||."""
    g = grad_f(x, y)
    return -g / la.norm(g)
p = np.array([1.0, 1.0])
g = grad_f(*p)
header('Exercise 2: Gradient Geometry')
check_close('gradient at (1,1)', g, numerical_gradient(f2, p))
# (b) ||grad f(1,1)|| = ||(3, 3)|| = 3*sqrt(2).
# Fix: a previous version compared against np.sqrt(9 + 9)**.5 * np.sqrt(2)
# (= 18**0.25 * sqrt(2) ≈ 2.91), which always printed FAIL; the correct
# magnitude is 3*sqrt(2) ≈ 4.24.
check_close('gradient magnitude', la.norm(g), 3*np.sqrt(2))
# (c) Level curve f=3 near p=(1,1): the tangent direction is the gradient
# rotated by 90 degrees, so the dot product with the gradient must vanish.
tangent = np.array([-g[1], g[0]])  # rotate grad by 90 degrees
tangent /= la.norm(tangent)
dot = np.dot(g, tangent)
check_close('grad ⊥ level curve tangent (dot=0)', dot, 0.0, tol=1e-10)
# (d) Steepest descent: D_u f = grad·u is minimised at u = -grad/||grad||,
# where it equals -||grad|| (Cauchy-Schwarz).
u = steepest_descent_dir(*p)
D_u = np.dot(g, u)
check_close('directional deriv in steepest descent = -||grad||', D_u, -la.norm(g))
check_close('u is unit vector', la.norm(u), 1.0)
print('\nTakeaway: The gradient points perpendicular to level curves; '
      'steepest descent is exactly -∇f/‖∇f‖.')
print("Exercise 2 solution complete.")
Exercise 3 (★): Directional Derivatives
Let $f(x, y) = e^{x^2 + y^2}$ (a radially symmetric exponential bowl centred at the origin; note the reference solution uses the positive exponent).
(a) Compute $D_u f(p)$ at $p = (1, 0)$ for $u = (\cos\theta, \sin\theta)$ as a function of $\theta$. Plot it for $\theta \in [0, 2\pi)$.
(b) Find the value of $\theta$ that maximises $D_u f(p)$ and the value that minimises it.
(c) Find all directions $u$ such that $D_u f(p) = 0$. These are the neutral directions (tangent to the level set).
(d) Verify numerically that the directional derivative formula $D_u f = \nabla f \cdot u$ matches the limit definition as $h \to 0$.
Code cell 11
# Your Solution
# Exercise 3 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 3."
print(msg)
Code cell 12
# Solution
# Exercise 3 - reference solution
# NOTE(review): the exponent is positive, f = exp(+(x^2+y^2)) — an exponential
# bowl rather than a Gaussian bump exp(-(x^2+y^2)). All checks below are
# consistent with the positive sign (the gradient at (1,0) points along +x).
f3 = lambda xy: np.exp(xy[0]**2 + xy[1]**2)
p3 = np.array([1.0, 0.0])
# Gradient of f at p: [2x*exp(x^2+y^2), 2y*exp(x^2+y^2)]
def grad_f3(p):
    val = np.exp(p[0]**2 + p[1]**2)
    return np.array([2*p[0]*val, 2*p[1]*val])
def directional_deriv(p, theta):
    # D_u f = grad f · u with u = (cos θ, sin θ)
    u = np.array([np.cos(theta), np.sin(theta)])
    return np.dot(grad_f3(p), u)
g3 = grad_f3(p3)
thetas = np.linspace(0, 2*np.pi, 1000)
Dvs = np.array([directional_deriv(p3, t) for t in thetas])
header('Exercise 3: Directional Derivatives')
# (a)-(b) max/min over the sampled directions; the 1e-3/0.01 tolerances
# reflect the 1000-point angular grid resolution.
theta_max = thetas[np.argmax(Dvs)]
theta_min = thetas[np.argmin(Dvs)]
check_close('max D_u f = ||grad||', Dvs.max(), la.norm(g3), tol=1e-3)
check_close('min D_u f = -||grad||', Dvs.min(), -la.norm(g3), tol=1e-3)
check_close('theta_max ≈ 0 (gradient direction)', theta_max, 0.0, tol=0.01)
# (c) Neutral directions: theta where D_u f = 0 (tangent to the level set)
neutral_thetas = [np.pi/2, 3*np.pi/2]  # perpendicular to gradient
for t in neutral_thetas:
    check_close(f'D_u f=0 at theta={t:.3f}', directional_deriv(p3, t), 0.0, tol=1e-10)
# (d) Limit definition vs formula: the forward difference converges at O(h),
# so the printed error should shrink roughly linearly with h.
theta_test = 0.7
u = np.array([np.cos(theta_test), np.sin(theta_test)])
for h in [1e-2, 1e-4, 1e-6]:
    limit_approx = (f3(p3 + h*u) - f3(p3)) / h
    formula = np.dot(g3, u)
    err = abs(limit_approx - formula)
    print(f'h={h:.0e}: limit={limit_approx:.6f}, formula={formula:.6f}, error={err:.2e}')
print('\nTakeaway: D_u f = ∇f·u maximised (=‖∇f‖) along ∇f; '
      'zero in the two perpendicular directions tangent to the level set.')
print("Exercise 3 solution complete.")
Exercise 4 (★★): Matrix Calculus from First Principles
Prove the following gradient identities and verify numerically at a random point.
(a) For $f(x) = a^\top x$ (linear function), prove $\nabla f = a$.
(b) For $f(x) = x^\top A x$ (quadratic form) with symmetric $A$, prove $\nabla f = 2Ax$.
(c) For $f(x) = \lVert Ax - b \rVert^2$ (least squares objective), prove $\nabla f = 2A^\top (Ax - b)$.
(d) For $f(X) = \operatorname{tr}(AX)$ (trace function, $X \in \mathbb{R}^{n \times n}$), show that $\partial f/\partial X_{ij} = A_{ji}$ so $\nabla_X f = A^\top$.
Code cell 14
# Your Solution
# Exercise 4 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 4."
print(msg)
Code cell 15
# Solution
# Exercise 4 - reference solution
import numpy as np
np.random.seed(42)
# Local re-definitions keep this cell runnable on its own.
def numerical_gradient(f, x, h=1e-5):
    """Centered-difference gradient of scalar f at a 1-D array x."""
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        ei = np.zeros_like(x, dtype=float); ei[i] = 1.0
        grad[i] = (f(x + h*ei) - f(x - h*ei)) / (2*h)
    return grad
def check_close(name, got, expected, tol=1e-5):
    """Print PASS/FAIL comparing got vs expected with np.allclose."""
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} — {name}")
    if not ok: print(' exp:', expected, '\n got:', got)
# The three core matrix-calculus identities:
def grad_linear(a, x): return a                                # grad(a^T x) = a
def grad_quadratic(A, x): return 2 * A @ x                     # grad(x^T A x) = 2Ax (A symmetric)
def grad_least_squares(A, b, x): return 2 * A.T @ (A @ x - b)  # grad ||Ax-b||^2 = 2A^T(Ax-b)
n = 4
a = np.random.randn(n)
A = np.random.randn(n, n); A = A + A.T  # symmetrise so 2Ax holds exactly
b = np.random.randn(n)
x = np.random.randn(n)
print('=' * 50)
print('Exercise 4: Matrix Calculus')
print('=' * 50)
# (a) grad(a^T x) = a
f_lin = lambda x: np.dot(a, x)
check_close('grad(a^Tx) = a', grad_linear(a, x), numerical_gradient(f_lin, x))
# (b) grad(x^T A x) = 2Ax
f_quad = lambda x: x @ A @ x
check_close('grad(x^TAx) = 2Ax', grad_quadratic(A, x), numerical_gradient(f_quad, x))
# (c) grad(||Ax-b||^2) = 2A^T(Ax-b)
f_ls = lambda x: np.sum((A @ x - b)**2)
check_close('grad(||Ax-b||^2)', grad_least_squares(A, b, x), numerical_gradient(f_ls, x))
# (d) grad_X tr(AX) = A^T, verified on the flattened matrix variable
n2 = 3
A2 = np.random.randn(n2, n2)
X2 = np.random.randn(n2, n2)
f_tr = lambda X_flat: np.trace(A2 @ X_flat.reshape(n2, n2))
num_grad = numerical_gradient(f_tr, X2.flatten()).reshape(n2, n2)
check_close('grad_X tr(AX) = A^T', num_grad, A2.T)
print('\nTakeaway: All ML loss gradients reduce to these three matrix calculus identities; '
      'proving them rigorously via index notation builds the foundation for backprop.')
print("Exercise 4 solution complete.")
Exercise 5 (★★): MSE Gradient and Linear Regression
The mean squared error for linear regression is:
$L(w) = \frac{1}{m} \lVert Xw - y \rVert^2$
(a) Derive $\nabla_w L = \frac{2}{m} X^\top (Xw - y)$.
(b) Implement gradient descent and run it on a synthetic dataset with $m = 200$ samples, $n = 5$ features, true weights $w^* = (1, -2, 0.5, 3, -1)$, noise $\sigma = 0.1$.
(c) Verify that the GD solution converges to the OLS solution $w_{\mathrm{OLS}} = (X^\top X)^{-1} X^\top y$.
(d) Plot the training loss curve. How many steps does it take to reach within $10^{-4}$ of $L^* = L(w_{\mathrm{OLS}})$?
Code cell 17
# Your Solution
# Exercise 5 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 5."
print(msg)
Code cell 18
# Solution
# Exercise 5 - reference solution
import numpy as np
np.random.seed(42)
def mse_grad(X, y, w):
    """Gradient of the mean squared error: (2/m) X^T (Xw - y)."""
    m = len(y)
    return (2 / m) * X.T @ (X @ w - y)
def gradient_descent_lr(X, y, w0, lr, n_steps):
    """Plain gradient descent on the MSE; returns (final w, loss history).

    The history has n_steps + 1 entries: the loss before each update plus
    the final loss after the last update.
    """
    m = len(y)
    w = w0.copy()
    losses = []
    for _ in range(n_steps):
        loss = np.mean((X @ w - y)**2)
        losses.append(loss)
        w -= lr * mse_grad(X, y, w)
    losses.append(np.mean((X @ w - y)**2))
    return w, losses
# Synthetic regression problem: m samples, n features, mild Gaussian noise.
m, n = 200, 5
w_true = np.array([1., -2., 0.5, 3., -1.])
X = np.random.randn(m, n)
y = X @ w_true + 0.1 * np.random.randn(m)
w0 = np.zeros(n)
w_gd, losses = gradient_descent_lr(X, y, w0, lr=0.1, n_steps=300)
# OLS solution via the normal equations (solve, not an explicit inverse)
w_ols = np.linalg.solve(X.T @ X, X.T @ y)
loss_star = np.mean((X @ w_ols - y)**2)
print('=' * 50)
print('Exercise 5: MSE Gradient and Linear Regression')
print('=' * 50)
def check_close(name, got, expected, tol=1e-3):
    """Print PASS/FAIL comparing got vs expected with np.allclose."""
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} — {name}")
check_close('GD converges to OLS solution', w_gd, w_ols, tol=1e-3)
check_close('GD converges to true weights', w_gd, w_true, tol=0.05)
# Step count to reach L* + 1e-4. Negating the (monotonically decreasing)
# excess-loss sequence makes it ascending, as np.searchsorted requires.
losses_arr = np.array(losses)
target = loss_star + 1e-4
steps_needed = np.searchsorted(-(losses_arr - loss_star), -1e-4)
print(f'Steps to reach L* + 1e-4: {steps_needed}')
print(f'Final loss: {losses[-1]:.6f}, OLS loss: {loss_star:.6f}')
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(8, 4))
    # 1e-10 floor keeps the log-scale plot finite once GD reaches the optimum
    ax.semilogy(losses_arr - loss_star + 1e-10, color='#2166AC', lw=2)
    ax.axhline(1e-4, color='#D6604D', ls='--', lw=1.5, label='Target: L*+1e-4')
    ax.set_xlabel('Step'); ax.set_ylabel('L(w) - L* (log)')
    ax.set_title('MSE gradient descent convergence')
    ax.legend()
    plt.tight_layout()
    plt.savefig('/tmp/mse_gd.png', dpi=100, bbox_inches='tight')
    plt.show()
except ImportError:
    pass
print('\nTakeaway: MSE gradient descent converges to the OLS solution; '
      'convergence rate ∝ 1 - η*λ_max(X^TX)/m, so conditioning matters.')
print("Exercise 5 solution complete.")
Exercise 6 (★★): Gradient Checking
Implement a general-purpose gradient checker and use it to debug three functions: one with a correct gradient, one with a sign error, and one with a scaling error.
(a) Implement gradient_check(f, grad_f, x, h=1e-5) that returns the relative error:
$\mathrm{rel\_error} = \frac{\lVert g_{\mathrm{num}} - g \rVert}{\lVert g_{\mathrm{num}} \rVert + \lVert g \rVert + \varepsilon}$
where $g_{\mathrm{num}}$ is the numerical gradient and $g$ is the provided analytic gradient.
(b) Test on:
- Correct: $f(x) = x^\top x$, $\nabla f = 2x$
- Sign error: $\nabla f = -2x$ (wrong sign)
- Scaling error: $\nabla f = x$ (missing factor of 2)
(c) Apply the checker to a two-layer neural network's weight gradient: $L(W) = \lVert \tanh(XW)v - y \rVert^2$ with $W \in \mathbb{R}^{d_{\mathrm{in}} \times d_h}$. Implement $\partial L/\partial W$ and verify it passes the gradient check (rel_error $< 10^{-5}$).
Code cell 20
# Your Solution
# Exercise 6 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 6."
print(msg)
Code cell 21
# Solution
# Exercise 6 - reference solution
import numpy as np
np.random.seed(42)
def numerical_gradient(f, x, h=1e-5):
    """Centered-difference gradient of scalar f at a 1-D array x."""
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        ei = np.zeros_like(x, dtype=float); ei[i] = 1.0
        grad[i] = (f(x + h*ei) - f(x - h*ei)) / (2*h)
    return grad
def gradient_check(f, grad_f_val, x, h=1e-5, eps=1e-8):
    """Relative error ||g_num - g|| / (||g_num|| + ||g|| + eps)."""
    num_grad = numerical_gradient(f, x, h)
    analytic = np.asarray(grad_f_val, dtype=float)
    diff = np.linalg.norm(num_grad - analytic)
    denom = np.linalg.norm(num_grad) + np.linalg.norm(analytic) + eps
    return diff / denom
print('=' * 55)
print('Exercise 6: Gradient Checking')
print('=' * 55)
x_test = np.random.randn(5)
f_sq = lambda x: np.dot(x, x)  # f(x) = x^T x, so the true gradient is 2x
def check_true(name, cond):
    print(f"{'PASS' if cond else 'FAIL'} — {name}")
# (b) correct gradient vs two deliberately broken ones
err_correct = gradient_check(f_sq, 2*x_test, x_test)
err_sign = gradient_check(f_sq, -2*x_test, x_test)
err_scale = gradient_check(f_sq, x_test, x_test)
print(f'Correct (2x): rel_error = {err_correct:.2e}')
print(f'Sign error (-2x): rel_error = {err_sign:.2e}')
print(f'Scale error (x): rel_error = {err_scale:.2e}')
check_true('correct gradient passes (err < 1e-5)', err_correct < 1e-5)
check_true('sign error fails (err > 1e-1)', err_sign > 1e-1)
check_true('scale error fails (err > 1e-1)', err_scale > 1e-1)
# (c) Two-layer neural network: L(W) = ||tanh(XW)v - y||^2
np.random.seed(0)
m_nn, d_in, d_h = 20, 4, 6
X_nn = np.random.randn(m_nn, d_in)
v = np.random.randn(d_h)
y_nn = np.random.randn(m_nn)
W = np.random.randn(d_in, d_h) * 0.1
def f_nn(W_flat):
    """Network loss as a function of the flattened weight matrix."""
    W_ = W_flat.reshape(d_in, d_h)
    H = np.tanh(X_nn @ W_)
    return np.sum((H @ v - y_nn)**2)
# Analytic gradient via the chain rule (tanh' = 1 - tanh^2):
# dL/dW = X^T [ 2*(H@v - y) * v^T * (1 - H^2) ]
def grad_nn(W):
    H = np.tanh(X_nn @ W)
    delta = 2 * (H @ v - y_nn)        # (m,)     residual term
    dH = delta[:, None] * v[None, :]  # (m, d_h) outer product with v
    dW = X_nn.T @ (dH * (1 - H**2))   # (d_in, d_h)
    return dW
g_analytic = grad_nn(W)
err_nn = gradient_check(f_nn, g_analytic.flatten(), W.flatten())
print(f'\n2-layer network gradient: rel_error = {err_nn:.2e}')
check_true('network gradient passes check (err < 1e-5)', err_nn < 1e-5)
print('\nTakeaway: Gradient checking is indispensable when implementing backprop; '
      'rel_error < 1e-5 is the gold standard for floating-point correctness.')
print("Exercise 6 solution complete.")
Exercise 7 (★★★): Softmax Gradient and Cross-Entropy
The softmax function maps logits $z \in \mathbb{R}^K$ to probabilities:
$p_i = \frac{e^{z_i}}{\sum_{k=1}^{K} e^{z_k}}$
(a) Derive the Jacobian of softmax: $\partial p_i/\partial z_j = p_i(\delta_{ij} - p_j)$. Express in matrix form: $J = \operatorname{diag}(p) - p p^\top$.
(b) The cross-entropy loss for true label $y$ is $L = -\log p_y$. Show $\nabla_z L = p - e_y$ (where $e_y$ is the one-hot vector).
(c) Implement stable softmax (using the log-sum-exp trick) and its gradient, then verify with gradient checking.
(d) For a batch of $m$ examples with one-hot matrix $Y$ and logit matrix $Z$, show $\nabla_Z \bar{L} = (P - Y)/m$ where $P = \operatorname{softmax}(Z)$ row-wise.
Code cell 23
# Your Solution
# Exercise 7 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 7."
print(msg)
Code cell 24
# Solution
# Exercise 7 - reference solution
import numpy as np
np.random.seed(42)
def numerical_gradient(f, x, h=1e-5):
    """Centered-difference gradient of scalar f at a 1-D array x."""
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        ei = np.zeros_like(x, dtype=float); ei[i] = 1.0
        grad[i] = (f(x + h*ei) - f(x - h*ei)) / (2*h)
    return grad
def check_close(name, got, expected, tol=1e-5):
    """Print PASS/FAIL comparing got vs expected with np.allclose."""
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} — {name}")
    if not ok: print(' exp:', expected, '\n got:', got)
def softmax(z):
    """Stable softmax of a 1-D logit vector (max-shift / log-sum-exp trick)."""
    z_shifted = z - z.max()  # numerical stability
    e = np.exp(z_shifted)
    return e / e.sum()
def softmax_jacobian(z):
    """Jacobian dp_i/dz_j = p_i (delta_ij - p_j) = diag(p) - p p^T."""
    p = softmax(z)
    return np.diag(p) - np.outer(p, p)
def ce_loss(z, y_true):
    """Cross-entropy -log p[y_true]; the 1e-15 guards log(0)."""
    p = softmax(z)
    return -np.log(p[y_true] + 1e-15)
def ce_grad(z, y_true):
    """Analytic CE gradient: p with 1 subtracted at the true class (p - e_y)."""
    p = softmax(z)
    g = p.copy()
    g[y_true] -= 1.0
    return g
print('=' * 55)
print('Exercise 7: Softmax Gradient and Cross-Entropy')
print('=' * 55)
z = np.array([2.0, 1.0, 0.1])
y = 0
p = softmax(z)
# (a) Jacobian
J = softmax_jacobian(z)
J_expected = np.diag(p) - np.outer(p, p)
check_close('softmax Jacobian = diag(p) - pp^T', J, J_expected)
# Verify the Jacobian column-by-column via centered differences
J_num = np.zeros((3, 3))
for j in range(3):
    ej = np.eye(3)[j] * 1e-5
    J_num[:, j] = (softmax(z + ej) - softmax(z - ej)) / (2e-5)
check_close('Jacobian matches numerical', J, J_num)
# (b) CE gradient = p - e_y
g_analytic = ce_grad(z, y)
f_ce = lambda z_: ce_loss(z_, y)
g_numerical = numerical_gradient(f_ce, z)
check_close('CE gradient = p - e_y', g_analytic, g_numerical)
# (d) Batch gradient of the mean CE loss: (P - Y)/m
m_b, K = 10, 4
Z = np.random.randn(m_b, K)
Y_idx = np.random.randint(0, K, size=m_b)
Y_onehot = np.eye(K)[Y_idx]
# Batch probabilities (row-wise softmax)
P = np.array([softmax(Z[i]) for i in range(m_b)])
batch_grad = (P - Y_onehot) / m_b
# Numerical check against the flattened logit matrix
def f_batch(Z_flat):
    Z_ = Z_flat.reshape(m_b, K)
    total = 0.0
    for i in range(m_b):
        total += ce_loss(Z_[i], Y_idx[i])
    return total / m_b
num_batch = numerical_gradient(f_batch, Z.flatten()).reshape(m_b, K)
check_close('batch grad = (P-Y)/m', batch_grad, num_batch, tol=1e-4)
print('\nTakeaway: The elegance of softmax+CE: gradient is simply p-y, '
      'the prediction error — the same formula powers GPT training.')
print("Exercise 7 solution complete.")
Exercise 8 (★★★): Natural Gradient for Gaussian Model
The natural gradient preconditions updates by the inverse Fisher Information Matrix (FIM), making steps geometry-aware in probability space:
$\theta_{t+1} = \theta_t - \eta\, F(\theta_t)^{-1} \nabla_\theta L(\theta_t)$
For a Gaussian model with parameters $\theta = (\mu, \log\sigma)$, the FIM has a known diagonal form.
(a) Show that for $x_1, \dots, x_N \sim \mathcal{N}(\mu, \sigma^2)$, the FIM of the joint likelihood is:
$F = \operatorname{diag}\!\left(\frac{N}{\sigma^2},\; 2N\right)$
when using parameters $\theta = (\mu, \log\sigma)$.
(b) Generate $N = 500$ samples from $\mathcal{N}(3, 0.5^2)$. Starting from $\theta_0 = (0, 0)$ (i.e. $\mu = 0$, $\sigma = 1$), fit by gradient descent on the negative log-likelihood with:
- Standard gradient descent
- Natural gradient descent
(c) Plot the parameter trajectories in $(\mu, \log\sigma)$ space on top of contours of the negative log-likelihood. Observe that natural gradient takes a more direct path.
(d) Verify that at convergence, both methods recover $\mu \approx 3$ and $\sigma \approx 0.5$.
Code cell 26
# Your Solution
# Exercise 8 — learner workspace.
# Develop your answer here, then run the reference solution below to compare.
msg = "Learner workspace ready for Exercise 8."
print(msg)
Code cell 27
# Solution
# Exercise 8 - reference solution
import numpy as np
np.random.seed(42)
def check_close(name, got, expected, tol=0.05):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
if not ok: print(f' expected {expected}, got {got}')
N = 500
mu_true, sigma_true = 3.0, 0.5
data = mu_true + sigma_true * np.random.randn(N)
def neg_log_likelihood(theta, data):
mu, log_s = theta[0], theta[1]
sigma = np.exp(log_s)
return 0.5 * np.sum(((data - mu) / sigma)**2) + N * log_s + 0.5 * N * np.log(2 * np.pi)
def nll_grad(theta, data):
mu, log_s = theta[0], theta[1]
sigma = np.exp(log_s)
# d NLL / d mu = -sum (x_i - mu) / sigma^2
grad_mu = -np.sum(data - mu) / sigma**2
# d NLL / d log_sigma = N - sum (x_i-mu)^2 / sigma^2
grad_ls = N - np.sum((data - mu)**2) / sigma**2
return np.array([grad_mu, grad_ls])
def fisher_info(theta):
sigma = np.exp(theta[1])
return np.diag([N / sigma**2, 2 * N])
def run_optimizer(theta0, data, lr, n_steps, use_natural=False):
theta = theta0.copy()
path = [theta.copy()]
for _ in range(n_steps):
g = nll_grad(theta, data)
if use_natural:
F = fisher_info(theta)
step = np.linalg.solve(F, g)
else:
step = g
theta -= lr * step
path.append(theta.copy())
return np.array(path)
theta0 = np.array([0.0, 0.0])
gd_path = run_optimizer(theta0, data, lr=1e-4, n_steps=300, use_natural=False)
ng_path = run_optimizer(theta0, data, lr=0.01, n_steps=300, use_natural=True)
print('=' * 55)
print('Exercise 8: Natural Gradient')
print('=' * 55)
# (d) Verify convergence
gd_mu, gd_sigma = gd_path[-1][0], np.exp(gd_path[-1][1])
ng_mu, ng_sigma = ng_path[-1][0], np.exp(ng_path[-1][1])
print(f'GD final: mu={gd_mu:.4f}, sigma={gd_sigma:.4f}')
print(f'NG final: mu={ng_mu:.4f}, sigma={ng_sigma:.4f}')
print(f'True: mu={mu_true:.4f}, sigma={sigma_true:.4f}')
check_close('GD recovers mu', gd_mu, mu_true, tol=0.1)
check_close('NG recovers mu', ng_mu, mu_true, tol=0.05)
check_close('NG recovers sigma', ng_sigma, sigma_true, tol=0.05)
# (b) Steps to converge: compare NLL values
nll_star = neg_log_likelihood(np.array([mu_true, np.log(sigma_true)]), data)
gd_nlls = [neg_log_likelihood(t, data) - nll_star for t in gd_path]
ng_nlls = [neg_log_likelihood(t, data) - nll_star for t in ng_path]
try:
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Trajectory in parameter space
mu_g = np.linspace(-0.5, 3.5, 100)
ls_g = np.linspace(-1.5, 0.8, 100)
MU, LS = np.meshgrid(mu_g, ls_g)
NLL = np.array([[neg_log_likelihood(np.array([m, l]), data)
for m in mu_g] for l in ls_g])
ax = axes[0]
ax.contourf(MU, LS, NLL, levels=30, cmap='viridis', alpha=0.5)
ax.plot(gd_path[:,0], gd_path[:,1], 'o-', ms=2, lw=1.5, color='#D6604D', label='GD')
ax.plot(ng_path[:,0], ng_path[:,1], 's-', ms=2, lw=1.5, color='#2166AC', label='Natural GD')
ax.plot(mu_true, np.log(sigma_true), 'k*', ms=14, label='True params')
ax.set_xlabel('$\\mu$'); ax.set_ylabel('$\\log\\sigma$')
ax.set_title('Parameter space trajectories')
ax.legend(fontsize=9)
axes[1].semilogy(np.maximum(gd_nlls, 1e-3), color='#D6604D', lw=2, label='GD')
axes[1].semilogy(np.maximum(ng_nlls, 1e-3), color='#2166AC', lw=2, label='Natural GD')
axes[1].set_xlabel('Step'); axes[1].set_ylabel('NLL - NLL* (log)')
axes[1].set_title('NLL convergence')
axes[1].legend()
plt.tight_layout()
plt.savefig('/tmp/natural_gradient.png', dpi=100, bbox_inches='tight')
plt.show()
except ImportError:
pass
print('\nTakeaway: Natural gradient removes the ill-conditioning from parameter space; '
'FIM^{-1}∇L converts Euclidean gradient steps to Riemannian steps on the '
'statistical manifold — the foundation of K-FAC, EKFAC, and Shampoo optimizers.')
print("Exercise 8 solution complete.")
Exercise 9 (★★★): Softmax Cross-Entropy Gradient
For logits $z \in \mathbb{R}^K$, probabilities $p = \operatorname{softmax}(z)$, and one-hot label $y$, the loss is
$L(z) = -\sum_{i=1}^{K} y_i \log p_i.$
Derive $\nabla_z L = p - y$ and verify it numerically.
Code cell 29
# Your Solution
# Exercise 9 — learner workspace.
# Derive and verify the softmax cross-entropy gradient here.
msg = "Learner workspace ready for Exercise 9."
print(msg)
Code cell 30
# Solution
# Exercise 9 - softmax cross-entropy gradient
header("Exercise 9: softmax cross-entropy gradient")
z = np.array([1.2, -0.3, 0.7, 2.0])
y = np.array([0.0, 0.0, 0.0, 1.0])  # one-hot label for the last class
def loss(z):
    # Cross-entropy -sum(y * log p); the 1e-12 guards log(0).
    p = softmax(z)
    return -np.sum(y * np.log(p + 1e-12))
p = softmax(z)
analytic = p - y  # the classic softmax+CE gradient in logit space
numeric = numerical_gradient(loss, z)
print("probabilities:", p)
print("analytic grad:", analytic)
print("numeric grad:", numeric)
check_close("gradient p-y", analytic, numeric, tol=1e-6)
# The components of p sum to 1 and y is one-hot, so (p - y) sums to zero:
# shifting all logits by a constant leaves the loss unchanged.
check_close("gradient sums to zero", analytic.sum(), 0.0, tol=1e-10)
print("Takeaway: multiclass backprop starts with prediction error in logit space.")
Exercise 10 (★★★): FGSM Direction as Steepest Loss Increase
For logistic loss $L(x) = \log\!\left(1 + e^{-y\, w^\top x}\right)$ with $y \in \{-1, +1\}$, compute $\nabla_x L = -y\,\sigma(-y\, w^\top x)\, w$ and show that the fastest increase under an $\ell_\infty$ perturbation $\lVert \delta \rVert_\infty \le \varepsilon$ uses
$\delta^* = \varepsilon\, \operatorname{sign}(\nabla_x L).$
Code cell 32
# Your Solution
# Exercise 10 — learner workspace.
# Compute the input gradient and compare random perturbations with FGSM here.
msg = "Learner workspace ready for Exercise 10."
print(msg)
Code cell 33
# Solution
# Exercise 10 - FGSM direction
header("Exercise 10: FGSM steepest direction")
w = np.array([0.8, -1.1, 0.4])
x = np.array([0.2, 0.5, -0.7])
y = 1.0     # label in {-1, +1}
eps = 0.05  # l_inf perturbation budget
def logistic_loss_x(x):
    """Logistic loss log(1 + exp(-y * w.x)) as a function of the input x."""
    margin = y * (w @ x)
    return np.logaddexp(0.0, -margin)  # stable log(1 + e^{-margin})
def grad_x(x):
    """Input gradient: dL/dx = -y * sigmoid(-margin) * w."""
    margin = y * (w @ x)
    return -y * w * sigmoid(-margin)
g = grad_x(x)
# FGSM: eps*sign(g) maximises the linearised loss g.delta over ||delta||_inf <= eps.
delta_fgsm = eps * np.sign(g)
base = logistic_loss_x(x)
fgsm_increase = logistic_loss_x(x + delta_fgsm) - base
# Baseline: 2000 random perturbations drawn uniformly from the same l_inf ball.
rng = np.random.default_rng(42)
random_increases = []
for _ in range(2000):
    delta = rng.uniform(-eps, eps, size=x.shape)
    random_increases.append(logistic_loss_x(x + delta) - base)
print("grad_x:", g)
print(f"FGSM increase={fgsm_increase:.8f}, best random={max(random_increases):.8f}")
check_true("FGSM beats sampled random l_inf perturbations", fgsm_increase >= max(random_increases) - 1e-8)
print("Takeaway: adversarial first-order attacks are directional derivative geometry.")
What to Review After Finishing
- Partial derivatives — can you compute $\partial f/\partial x_i$ for polynomial, exponential, and composite functions from the limit definition?
- Gradient geometry — can you show $\nabla f \perp$ level sets and that $\nabla f$ is the steepest ascent direction via Cauchy-Schwarz?
- Directional derivatives — can you find max/min directions and identify neutral directions?
- Matrix calculus — do you know the three key identities ($\nabla(a^\top x) = a$, $\nabla(x^\top A x) = 2Ax$, $\nabla\lVert Ax - b \rVert^2 = 2A^\top(Ax - b)$) and can you prove them?
- ML gradients — can you derive and implement the MSE and CE gradients?
- Gradient checking — do you know the relative error formula and when to use it?
- Softmax + CE gradient — do you understand why the gradient is $p - y$?
- Natural gradient — can you explain why standard GD is geometry-blind and how the FIM corrects this?
References
- Goodfellow, Bengio & Courville, Deep Learning (2016) — Ch. 4 (Numerical Computation)
- Boyd & Vandenberghe, Convex Optimization (2004) — Ch. 3 (Convex Functions)
- Amari, Information Geometry and Its Applications (2016) — Ch. 1–2 (Natural Gradient)
- Magnus & Neudecker, Matrix Differential Calculus (2019) — canonical reference
- Matrix Cookbook — identity reference (equations 61–82 for gradient identities)