Theory Notebook
Converted from theory.ipynb for web reading.
Partial Derivatives and Gradients
"The gradient is the generalization of the derivative to functions of several variables, and it points in the direction in which the function increases most rapidly."
Interactive theory notebook covering partial derivatives, gradient fields, directional derivatives, and gradient applications in machine learning.
Topics: Scalar fields → Partial derivatives → Gradient vector → Directional derivatives → ML gradients → Loss landscapes → Numerical differentiation
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize
try:
import torch
HAS_TORCH = True
except ImportError:
torch = None
HAS_TORCH = False
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
ok = bool(cond)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
return ok
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
return ok
def check(name, got, expected, tol=1e-8):
return check_close(name, got, expected, tol=tol)
def sigmoid(x):
x = np.asarray(x, dtype=float)
return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))
def softmax(z, axis=-1):
z = np.asarray(z, dtype=float)
z = z - np.max(z, axis=axis, keepdims=True)
e = np.exp(z)
return e / np.sum(e, axis=axis, keepdims=True)
def relu(x):
return np.maximum(0, x)
def relu_prime(x):
return np.where(np.asarray(x) > 0, 1.0, 0.0)
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
def numerical_gradient(f, x, h=1e-6):
x = np.asarray(x, dtype=float)
grad = np.zeros_like(x, dtype=float)
for idx in np.ndindex(x.shape):
xp = x.copy(); xm = x.copy()
xp[idx] += h; xm[idx] -= h
grad[idx] = (f(xp) - f(xm)) / (2*h)
return grad
def numerical_jacobian(f, x, h=1e-6):
x = np.asarray(x, dtype=float)
y0 = np.asarray(f(x), dtype=float)
J = np.zeros((y0.size, x.size), dtype=float)
flat = x.reshape(-1)
for j in range(x.size):
xp = flat.copy(); xm = flat.copy()
xp[j] += h; xm[j] -= h
J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
return J.reshape(y0.shape + x.shape)
def grad_check(f, x, analytic_grad, h=1e-6):
numeric_grad = numerical_gradient(f, x, h=h)
denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
return la.norm(analytic_grad - numeric_grad) / denom
def jacobian_fd(f, x, h=1e-6):
x = np.asarray(x, dtype=float)
y0 = np.asarray(f(x), dtype=float)
J = np.zeros((y0.size, x.size), dtype=float)
flat = x.reshape(-1)
for j in range(x.size):
xp = flat.copy(); xm = flat.copy()
xp[j] += h; xm[j] -= h
yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
J[:, j] = (yp - ym) / (2*h)
return J.reshape(y0.shape + x.shape)
def hessian_fd(f, x, h=1e-5):
x = np.asarray(x, dtype=float)
H = np.zeros((x.size, x.size), dtype=float)
flat = x.reshape(-1)
for j in range(x.size):
xp = flat.copy(); xm = flat.copy()
xp[j] += h; xm[j] -= h
gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
H[:, j] = (gp - gm) / (2*h)
return H.reshape(x.shape + x.shape)
def grad_fd(f, x, h=1e-6):
return numerical_gradient(f, x, h=h)
def fd_grad(f, x, h=1e-6):
return numerical_gradient(f, np.asarray(x, dtype=float), h=h)
print("Chapter helper setup complete.")
1. Functions of Multiple Variables — Scalar Fields
A scalar field $f:\mathbb{R}^n \to \mathbb{R}$ assigns a number to each point. Visualizing it as a surface or contour map provides geometric intuition for partial derivatives and gradients.
Code cell 5
# === 2.1 Scalar Fields — Surface and Contour Plots ===
x = np.linspace(-3, 3, 80)
y = np.linspace(-3, 3, 80)
X, Y = np.meshgrid(x, y)
# Three classic scalar fields
f_bowl = X**2 + Y**2 # paraboloid (convex, unique min)
f_saddle = X**2 - Y**2 # saddle (min in x, max in y)
f_gauss = np.exp(-(X**2 + Y**2)/2) # Gaussian (smooth hill)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Scalar Fields: $f: \\mathbb{R}^2 \\to \\mathbb{R}$', fontsize=15)
for ax, F, title in zip(axes,
[f_bowl, f_saddle, f_gauss],
['$f = x^2 + y^2$ (bowl)', '$f = x^2 - y^2$ (saddle)', '$f = e^{-(x^2+y^2)/2}$ (Gaussian)']):
cp = ax.contourf(X, Y, F, levels=20, cmap='plasma')
fig.colorbar(cp, ax=ax)
ax.contour(X, Y, F, levels=10, colors='white', alpha=0.4, linewidths=0.8)
ax.set_title(title)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
fig.tight_layout()
plt.show()
print('Plotted three canonical scalar fields.')
2.2 Level Sets and Contour Geometry
The level set $\{\mathbf{x} : f(\mathbf{x}) = c\}$ is a curve (in 2D) or a hypersurface (in higher dimensions) on which $f$ is constant. We will verify that $\nabla f$ is perpendicular to the level set at every point.
Code cell 7
# === 2.2 Level Sets and Gradient Direction ===
x = np.linspace(-2.5, 2.5, 200)
y = np.linspace(-2.5, 2.5, 200)
X, Y = np.meshgrid(x, y)
F = X**2 + Y**2 # paraboloid
# Gradient field (analytic): nabla f = (2x, 2y)
xg, yg = np.meshgrid(np.linspace(-2, 2, 10), np.linspace(-2, 2, 10))
Gx = 2 * xg # ∂f/∂x
Gy = 2 * yg # ∂f/∂y
fig, ax = plt.subplots(figsize=(7, 7))
cp = ax.contour(X, Y, F, levels=[0.5, 1, 2, 4, 7], colors=COLORS['primary'])
ax.clabel(cp, fmt='$f=%.1f$', fontsize=10)
ax.quiver(xg, yg, Gx, Gy, color=COLORS['error'], alpha=0.75,
label='$\\nabla f = (2x, 2y)^\\top$')
ax.set_title('Gradient $\\nabla f$ is perpendicular to level curves')
ax.set_xlabel('$x$'); ax.set_ylabel('$y$')
ax.set_aspect('equal')
ax.legend()
fig.tight_layout()
plt.show()
# Verify numerically: at (1, 1), gradient = (2,2), tangent direction ~ (-1,1)
grad_at_11 = np.array([2.0, 2.0])
tangent = np.array([-1.0, 1.0])
dot = np.dot(grad_at_11, tangent)
print(f'nabla f(1,1) = {grad_at_11}')
print(f'Tangent to level curve at (1,1): {tangent}')
print(f'Dot product (should be 0): {dot}')
ok = np.isclose(dot, 0)
print(f'PASS - gradient perpendicular to level set' if ok else 'FAIL')
3. Partial Derivatives
The partial derivative $\partial f/\partial x_i$ measures the rate of change of $f$ when we move parallel to the $x_i$-axis, holding all other variables fixed.
Code cell 9
# === 3.1 Partial Derivative via Limit — Numerical Demo ===
def f(x, y):
return x**2 * y + np.exp(x * y)
# Analytic partials
def df_dx(x, y): return 2*x*y + y*np.exp(x*y)
def df_dy(x, y): return x**2 + x*np.exp(x*y)
a = (1.0, 2.0)
analytic_x = df_dx(*a)
analytic_y = df_dy(*a)
# Numerical limit: shrink h and watch convergence
print('Convergence of forward difference to partial derivative at (1, 2):')
print(f'Analytic ∂f/∂x = {analytic_x:.8f}')
print()
for h in [1e-1, 1e-3, 1e-5, 1e-7, 1e-9]:
numeric = (f(a[0]+h, a[1]) - f(*a)) / h
err = abs(numeric - analytic_x)
print(f' h={h:.0e} numeric={numeric:.8f} error={err:.2e}')
print()
print(f'Analytic ∂f/∂y = {analytic_y:.8f}')
numeric_y = (f(a[0], a[1]+1e-5) - f(*a)) / 1e-5
print(f'Numeric ∂f/∂y ≈ {numeric_y:.8f}')
numeric_x = (f(a[0]+1e-5, a[1]) - f(*a)) / 1e-5
ok = np.allclose([numeric_x, numeric_y], [analytic_x, analytic_y], atol=1e-3)
print('PASS - numerical partials match analytic formulas' if ok else 'FAIL')
Code cell 10
# === 3.1 Geometric Meaning — Cross-Section Slopes ===
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Partial Derivatives as Slopes of Cross-Sections', fontsize=15)
# Left: surface with cross-section lines
x = np.linspace(-2, 2, 100)
y_fixed = 1.0 # fixed y for ∂f/∂x
x_fixed = 0.5 # fixed x for ∂f/∂y
f_cross_x = f(x, y_fixed) # slice at y = y_fixed
f_cross_y = f(x_fixed, x) # slice at x = x_fixed
axes[0].plot(x, f_cross_x, color=COLORS['primary'], lw=2.5, label=f'$f(x, {y_fixed})$')
x0 = 1.0
slope_x = df_dx(x0, y_fixed)
tangent_x = f(x0, y_fixed) + slope_x * (x - x0)
axes[0].plot(x, tangent_x, '--', color=COLORS['error'],
label=f'tangent slope = $\\partial f/\\partial x$ = {slope_x:.2f}')
axes[0].scatter([x0], [f(x0, y_fixed)], color=COLORS['highlight'], s=80, zorder=5)
axes[0].set_title(f'Slice at $y={y_fixed}$: slope = $\\partial f/\\partial x$')
axes[0].set_xlabel('$x$'); axes[0].set_ylabel('$f(x, y_0)$')
axes[0].legend(fontsize=10)
axes[1].plot(x, f_cross_y, color=COLORS['secondary'], lw=2.5, label=f'$f({x_fixed}, y)$')
y0 = 2.0
slope_y = df_dy(x_fixed, y0)
tangent_y = f(x_fixed, y0) + slope_y * (x - y0)
axes[1].plot(x, tangent_y, '--', color=COLORS['error'],
label=f'tangent slope = $\\partial f/\\partial y$ = {slope_y:.2f}')
axes[1].scatter([y0], [f(x_fixed, y0)], color=COLORS['highlight'], s=80, zorder=5)
axes[1].set_title(f'Slice at $x={x_fixed}$: slope = $\\partial f/\\partial y$')
axes[1].set_xlabel('$y$'); axes[1].set_ylabel('$f(x_0, y)$')
axes[1].legend(fontsize=10)
fig.tight_layout()
plt.show()
print(f'∂f/∂x at (1.0, 1.0) = {df_dx(1.0,1.0):.4f}')
print(f'∂f/∂y at (0.5, 2.0) = {df_dy(0.5,2.0):.4f}')
Code cell 11
# === 3.3-3.4 Mixed Partials — Clairaut's Theorem ===
# f(x,y) = x^3 * y^2 + sin(xy)
# ∂f/∂x = 3x^2 y^2 + y cos(xy)
# ∂f/∂y = 2x^3 y + x cos(xy)
# ∂²f/∂y∂x = 6x^2 y + cos(xy) - xy sin(xy)
# ∂²f/∂x∂y = 6x^2 y + cos(xy) - xy sin(xy) (same!)
def f2(x, y): return x**3 * y**2 + np.sin(x*y)
def d2f_dydx(x, y): return 6*x**2*y + np.cos(x*y) - x*y*np.sin(x*y)
def d2f_dxdy(x, y): return 6*x**2*y + np.cos(x*y) - x*y*np.sin(x*y)
# Verify via finite differences
h = 1e-5
pts = [(1.0, 2.0), (0.5, -1.0), (np.pi, 1.0)]
print('Verification of Clairaut\'s theorem: ∂²f/∂y∂x = ∂²f/∂x∂y')
print(f"{'Point':>20} {'∂²/∂y∂x':>12} {'∂²/∂x∂y':>12} {'Match':>8}")
for (x0, y0) in pts:
# Numerical ∂²f/∂y∂x: first diff w.r.t. x, then diff w.r.t. y
df_dx_at_yph = (f2(x0+h, y0+h) - f2(x0, y0+h)) / h
df_dx_at_y = (f2(x0+h, y0) - f2(x0, y0)) / h
mixed_yx = (df_dx_at_yph - df_dx_at_y) / h
df_dy_at_xph = (f2(x0+h, y0+h) - f2(x0+h, y0)) / h
df_dy_at_x = (f2(x0, y0+h) - f2(x0, y0)) / h
mixed_xy = (df_dy_at_xph - df_dy_at_x) / h
match = np.isclose(mixed_yx, mixed_xy, rtol=1e-4)
print(f' ({x0:.2f}, {y0:.2f}): {mixed_yx:>12.6f} {mixed_xy:>12.6f} {"PASS" if match else "FAIL"}')
print('\nClairaut\'s theorem: mixed partials are equal for C^2 functions.')
4. The Gradient Vector
The gradient collects all partial derivatives into a single column vector:
$$\nabla f(\mathbf{x}) = \left(\frac{\partial f}{\partial x_1}, \dots, \frac{\partial f}{\partial x_n}\right)^{\top}$$
Key theorem: $\nabla f$ points in the direction of steepest ascent, with magnitude $\|\nabla f\|$ equal to the maximum directional derivative.
Code cell 13
# === 4.5 Gradient of Standard ML Forms ===
import numpy.linalg as la
np.random.seed(42)
n = 5
x = np.random.randn(n)
a = np.random.randn(n)
A = np.random.randn(n, n)
A_sym = (A + A.T) / 2 # make symmetric
print('=== Gradient formulas verification ===')
# 1. nabla(a^T x) = a
f1 = lambda x: a @ x
grad1_analytic = a.copy()
grad1_numeric = np.array([(f1(x + 1e-5*np.eye(n)[i]) - f1(x - 1e-5*np.eye(n)[i])) / 2e-5
for i in range(n)])
ok1 = np.allclose(grad1_analytic, grad1_numeric, atol=1e-5)
print(f'PASS - nabla(a^T x) = a' if ok1 else f'FAIL - nabla(a^T x)')
# 2. nabla(x^T x) = 2x
f2 = lambda x: x @ x
grad2_analytic = 2 * x
grad2_numeric = np.array([(f2(x + 1e-5*np.eye(n)[i]) - f2(x - 1e-5*np.eye(n)[i])) / 2e-5
for i in range(n)])
ok2 = np.allclose(grad2_analytic, grad2_numeric, atol=1e-5)
print(f'PASS - nabla(x^T x) = 2x' if ok2 else f'FAIL - nabla(x^T x)')
# 3. nabla(x^T A x) = (A + A^T) x
f3 = lambda x: x @ A @ x
grad3_analytic = (A + A.T) @ x
grad3_numeric = np.array([(f3(x + 1e-5*np.eye(n)[i]) - f3(x - 1e-5*np.eye(n)[i])) / 2e-5
for i in range(n)])
ok3 = np.allclose(grad3_analytic, grad3_numeric, atol=1e-5)
print(f'PASS - nabla(x^T A x) = (A+A^T)x' if ok3 else f'FAIL - nabla(x^T A x)')
# 4. nabla(x^T A_sym x) = 2 A_sym x
f4 = lambda x: x @ A_sym @ x
grad4_analytic = 2 * A_sym @ x
grad4_numeric = np.array([(f4(x + 1e-5*np.eye(n)[i]) - f4(x - 1e-5*np.eye(n)[i])) / 2e-5
for i in range(n)])
ok4 = np.allclose(grad4_analytic, grad4_numeric, atol=1e-5)
print(f'PASS - nabla(x^T A_sym x) = 2 A_sym x (A sym)' if ok4 else 'FAIL')
# 5. nabla ||Ax - b||^2 = 2 A^T(Ax - b)
m = 8
B = np.random.randn(m, n)
b = np.random.randn(m)
f5 = lambda x: la.norm(B @ x - b)**2
grad5_analytic = 2 * B.T @ (B @ x - b)
grad5_numeric = np.array([(f5(x + 1e-5*np.eye(n)[i]) - f5(x - 1e-5*np.eye(n)[i])) / 2e-5
for i in range(n)])
ok5 = np.allclose(grad5_analytic, grad5_numeric, atol=1e-4)
print(f'PASS - nabla ||Bx-b||^2 = 2B^T(Bx-b)' if ok5 else 'FAIL')
4.2 The Gradient Points in the Direction of Steepest Ascent
Proof via Cauchy-Schwarz: for any unit vector $\mathbf{u}$,
$$D_{\mathbf{u}} f = \nabla f^{\top} \mathbf{u} \le \|\nabla f\| \, \|\mathbf{u}\| = \|\nabla f\|$$
Equality holds when $\mathbf{u} = \nabla f / \|\nabla f\|$. The gradient IS the direction of steepest ascent.
Code cell 15
# === 4.2 Steepest Ascent Direction Demo ===
import numpy as np
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
def f_test(x, y): return np.sin(x) * np.cos(y) + 0.3*(x**2 + y**2)
def grad_f(x, y):
return np.array([np.cos(x)*np.cos(y) + 0.6*x,
-np.sin(x)*np.sin(y) + 0.6*y])
# At a specific point, compute directional derivatives in many directions
a = np.array([1.0, 0.5])
g = grad_f(*a)
grad_norm = np.linalg.norm(g)
thetas = np.linspace(0, 2*np.pi, 360)
dir_derivs = []
for theta in thetas:
u = np.array([np.cos(theta), np.sin(theta)])
dir_derivs.append(g @ u) # = ||g|| cos(angle)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Directional Derivative vs. Direction', fontsize=15)
axes[0].plot(np.degrees(thetas), dir_derivs, color=COLORS['primary'])
max_idx = np.argmax(dir_derivs)
axes[0].axvline(np.degrees(thetas[max_idx]), color=COLORS['error'],
ls='--', label=f'Max at θ={np.degrees(thetas[max_idx]):.1f}°')
axes[0].axhline(grad_norm, color=COLORS['tertiary'],
ls=':', label=f'||∇f|| = {grad_norm:.3f}')
axes[0].set_xlabel('Direction angle θ (degrees)')
axes[0].set_ylabel('$D_u f$')
axes[0].set_title('Directional derivative at $(1, 0.5)$')
axes[0].legend()
# Right: gradient field
xg, yg = np.meshgrid(np.linspace(-2, 2, 12), np.linspace(-2, 2, 12))
Gx = np.cos(xg)*np.cos(yg) + 0.6*xg
Gy = -np.sin(xg)*np.sin(yg) + 0.6*yg
axes[1].quiver(xg, yg, Gx, Gy, color=COLORS['primary'], alpha=0.75)
axes[1].scatter(*a, color=COLORS['error'], s=100, zorder=5, label='Query point')
axes[1].arrow(*a, *(0.4*g/grad_norm), head_width=0.08, color=COLORS['error'])
axes[1].set_title('Gradient field $\\nabla f$')
axes[1].set_xlabel('$x$'); axes[1].set_ylabel('$y$')
axes[1].legend()
fig.tight_layout()
plt.show()
grad_dir = g / grad_norm
max_dir = np.array([np.cos(thetas[max_idx]), np.sin(thetas[max_idx])])
print(f'Gradient direction: {grad_dir}')
print(f'Direction of max D_u f: {max_dir}')
print(f'PASS - match' if np.allclose(grad_dir, max_dir, atol=0.01) else 'FAIL')
5. Directional Derivatives
The directional derivative $D_{\mathbf{u}} f = \nabla f^{\top} \mathbf{u}$ gives the rate of change in any direction $\mathbf{u}$ (unit vector). It is maximized in the direction $\nabla f / \|\nabla f\|$ and zero along level sets.
Code cell 17
# === 5.3 Extremizing the Directional Derivative ===
import numpy as np
import numpy.linalg as la
# f(x,y) = e^(x^2 + y^2) at a = (1, 0)
def f_exp(x, y): return np.exp(x**2 + y**2)
def grad_exp(x, y): return np.array([2*x*np.exp(x**2+y**2),
2*y*np.exp(x**2+y**2)])
a = (1.0, 0.0)
g = grad_exp(*a)
print(f'f(1,0) = {f_exp(*a):.4f}')
print(f'nabla f(1,0) = {g}')
print(f'||nabla f(1,0)|| = {la.norm(g):.4f}')
# Sample directions and compute directional derivatives
test_dirs = {
'Steepest ascent (grad dir)': g / la.norm(g),
'North (0,1)': np.array([0., 1.]),
'East (1,0)': np.array([1., 0.]),
'Steepest descent': -g / la.norm(g),
'Along level set': np.array([0., 1.]) if np.isclose(g[1],0) else
np.array([-g[1], g[0]]) / la.norm(g),
}
print()
print(f"{'Direction':>30} {'u':>14} {'D_u f':>8}")
for name, u in test_dirs.items():
u = u / la.norm(u) # ensure unit
Du = g @ u
print(f' {name:>28} {str(u.round(3)):>14} {Du:>8.4f}')
print(f'\nMax D_u f = ||grad|| = {la.norm(g):.4f}')
ok = np.isclose(max([g@(u/la.norm(u)) for u in test_dirs.values()]), la.norm(g), atol=0.01)
print(f'PASS - maximum equals gradient norm' if ok else 'FAIL')
6. Gradient in Machine Learning
We compute and verify gradients for the most important ML loss functions.
Code cell 19
# === 6.1 MSE Gradient — Derivation and Verification ===
import numpy as np
import numpy.linalg as la
np.random.seed(42)
m, d = 50, 4
X = np.random.randn(m, d)
w_true = np.array([2.0, -1.0, 0.5, 1.5])
y = X @ w_true + 0.1 * np.random.randn(m)
def mse_loss(w):
return (1/m) * la.norm(X @ w - y)**2
def mse_grad(w):
return (2/m) * X.T @ (X @ w - y)
w0 = np.zeros(d)
# Verify gradient via centered differences
h = 1e-5
num_grad = np.array([(mse_loss(w0 + h*np.eye(d)[i]) - mse_loss(w0 - h*np.eye(d)[i])) / (2*h)
for i in range(d)])
anal_grad = mse_grad(w0)
print('MSE gradient verification:')
print(f'Analytic: {anal_grad}')
print(f'Numerical: {num_grad}')
rel_err = la.norm(anal_grad - num_grad) / (la.norm(anal_grad) + la.norm(num_grad))
print(f'Relative error: {rel_err:.2e}')
print(f'PASS - MSE gradient correct' if rel_err < 1e-6 else 'FAIL')
# Verify: gradient = 0 at OLS solution
w_ols = la.lstsq(X, y, rcond=None)[0]
grad_at_ols = mse_grad(w_ols)
print(f'\n||nabla L at OLS solution|| = {la.norm(grad_at_ols):.2e} (should be ~0)')
print(f'PASS - gradient zero at OLS' if la.norm(grad_at_ols) < 1e-10 else 'FAIL')
Code cell 20
# === 6.2 Logistic Regression Gradient ===
import numpy as np
import numpy.linalg as la
np.random.seed(42)
m, d = 100, 3
X_cls = np.random.randn(m, d)
w_cls_true = np.array([1.0, -2.0, 0.5])
logits = X_cls @ w_cls_true
y_cls = (logits + 0.5 * np.random.randn(m) > 0).astype(float)
def sigmoid(z): return 1.0 / (1.0 + np.exp(-np.clip(z, -30, 30)))
def ce_loss(w):
p = sigmoid(X_cls @ w)
return -(1/m) * np.sum(y_cls * np.log(p + 1e-10) + (1-y_cls) * np.log(1-p + 1e-10))
def ce_grad(w):
p = sigmoid(X_cls @ w)
return (1/m) * X_cls.T @ (p - y_cls) # elegant formula!
w0 = np.zeros(d)
h = 1e-5
num_grad = np.array([(ce_loss(w0 + h*np.eye(d)[i]) - ce_loss(w0 - h*np.eye(d)[i])) / (2*h)
for i in range(d)])
anal_grad = ce_grad(w0)
rel_err = la.norm(anal_grad - num_grad) / (la.norm(anal_grad) + la.norm(num_grad))
print('Cross-entropy gradient: (1/m) X^T (p - y)')
print(f'Analytic: {anal_grad}')
print(f'Numerical: {num_grad}')
print(f'Relative error: {rel_err:.2e}')
print(f'PASS - CE gradient correct' if rel_err < 1e-6 else 'FAIL')
Code cell 21
# === 6.5 Gradient Descent on MSE Loss ===
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
np.random.seed(42)
m, d = 50, 4
X = np.random.randn(m, d)
w_true = np.array([2.0, -1.0, 0.5, 1.5])
y = X @ w_true + 0.1 * np.random.randn(m)
def mse_loss(w): return (1/m) * la.norm(X @ w - y)**2
def mse_grad(w): return (2/m) * X.T @ (X @ w - y)
# Run gradient descent with multiple learning rates
T = 200
lrs = [0.001, 0.01, 0.1]
fig, ax = plt.subplots(figsize=(10, 5))
for lr, color in zip(lrs, [COLORS['primary'], COLORS['secondary'], COLORS['error']]):
w = np.zeros(d)
losses = [mse_loss(w)]
for _ in range(T):
w -= lr * mse_grad(w)
losses.append(mse_loss(w))
ax.semilogy(losses, color=color, label=f'$\\eta={lr}$')
# OLS optimal loss (lower bound)
w_ols = la.lstsq(X, y, rcond=None)[0]
ax.axhline(mse_loss(w_ols), ls='--', color=COLORS['neutral'], label='OLS optimum')
ax.set_title('Gradient Descent on MSE — Effect of Learning Rate')
ax.set_xlabel('Step $t$'); ax.set_ylabel('Loss (log scale)')
ax.legend()
fig.tight_layout()
plt.show()
# Verify convergence
w = np.zeros(d)
for _ in range(500): w -= 0.01 * mse_grad(w)
print(f'GD solution: {w}')
print(f'OLS solution: {w_ols}')
ok = np.allclose(w, w_ols, atol=0.01)
print(f'PASS - GD converges to OLS solution' if ok else 'FAIL')
Code cell 22
# === 6.5 2D Loss Landscape with Gradient Descent Path ===
import numpy as np
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
# 2D quadratic loss: L(w1, w2) = (w1 - 1)^2 + 10*(w2 - 1)^2
# (elongated ellipse — ill-conditioned)
def loss_2d(w): return (w[0]-1)**2 + 10*(w[1]-1)**2
def grad_2d(w): return np.array([2*(w[0]-1), 20*(w[1]-1)])
w1, w2 = np.meshgrid(np.linspace(-1, 3, 200), np.linspace(-1, 3, 200))
L = (w1-1)**2 + 10*(w2-1)**2
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('Gradient Descent on Elongated Loss Landscape', fontsize=15)
for ax, lr, title in zip(axes, [0.05, 0.09],
['$\\eta=0.05$ (slow but stable)', '$\\eta=0.09$ (near oscillation)']):
cp = ax.contourf(w1, w2, L, levels=30, cmap='plasma')
fig.colorbar(cp, ax=ax)
ax.contour(w1, w2, L, levels=10, colors='white', alpha=0.3)
# Run GD
w = np.array([-0.5, -0.5])
path = [w.copy()]
for _ in range(40):
w = w - lr * grad_2d(w)
path.append(w.copy())
path = np.array(path)
ax.plot(path[:,0], path[:,1], 'o-', color=COLORS['error'],
ms=3, lw=1.5, label='GD path')
ax.scatter([1], [1], marker='*', s=200, color='gold', zorder=5, label='Minimum (1,1)')
ax.set_xlim(-1, 3); ax.set_ylim(-1, 3)
ax.set_xlabel('$w_1$'); ax.set_ylabel('$w_2$')
ax.set_title(title)
ax.legend(fontsize=9)
fig.tight_layout()
plt.show()
print('Elongated contours cause GD to zigzag — motivates Adam and preconditioning.')
7. Numerical Differentiation
Gradient checking uses centered finite differences to verify analytical gradient implementations.
Code cell 24
# === 7.1-7.2 Finite Difference Error vs. Step Size ===
import numpy as np
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
# f(x) = exp(sin(x^2)) at x = 1.0
f = lambda x: np.exp(np.sin(x**2))
df = lambda x: np.exp(np.sin(x**2)) * np.cos(x**2) * 2*x # analytic
x0 = 1.0
true_grad = df(x0)
h_vals = np.logspace(-12, 0, 100)
fwd_err, cen_err = [], []
for h in h_vals:
fwd = (f(x0 + h) - f(x0)) / h
cen = (f(x0 + h) - f(x0 - h)) / (2*h)
fwd_err.append(abs(fwd - true_grad))
cen_err.append(abs(cen - true_grad))
fig, ax = plt.subplots(figsize=(10, 6))
ax.loglog(h_vals, fwd_err, color=COLORS['secondary'], label='Forward diff: $O(h)$ error')
ax.loglog(h_vals, cen_err, color=COLORS['primary'], label='Centered diff: $O(h^2)$ error')
ax.axvline(1e-5, ls='--', color=COLORS['neutral'], label='Optimal $h \\approx 10^{-5}$')
# Reference lines
h_ref = np.logspace(-7, -1, 20)
ax.loglog(h_ref, h_ref, 'k-', alpha=0.4, label='slope-1 reference ($O(h)$)')
ax.loglog(h_ref, h_ref**2, 'k--', alpha=0.4, label='slope-2 reference ($O(h^2)$)')
ax.set_title('Numerical Gradient Error vs. Step Size $h$')
ax.set_xlabel('Step size $h$')
ax.set_ylabel('|error|')
ax.legend()
fig.tight_layout()
plt.show()
# Find optimal h for centered differences
opt_idx = np.argmin(cen_err)
print(f'Optimal h for centered differences: {h_vals[opt_idx]:.2e}')
print(f'Minimum error: {cen_err[opt_idx]:.2e}')
print(f'\nKey insight: optimal h ≈ epsilon_mach^(1/3) ≈ {np.cbrt(np.finfo(float).eps):.2e}')
Code cell 25
# === 7.3 Gradient Checking Implementation ===
import numpy as np
import numpy.linalg as la
def numerical_gradient(f, x, h=1e-5):
"""Centered-difference gradient for f: R^n -> R."""
n = len(x)
grad = np.zeros(n)
for i in range(n):
e = np.zeros(n); e[i] = 1.0
grad[i] = (f(x + h*e) - f(x - h*e)) / (2*h)
return grad
def gradient_check(f, grad_f, x, h=1e-5, tol=1e-5):
"""Compare analytic and numeric gradients. Returns relative error."""
g_anal = grad_f(x)
g_num = numerical_gradient(f, x, h)
rel_err = la.norm(g_anal - g_num) / (la.norm(g_anal) + la.norm(g_num) + 1e-15)
status = 'PASS' if rel_err < tol else 'FAIL'
return rel_err, status, g_anal, g_num
# Test on several functions
np.random.seed(42)
n = 6
x0 = np.random.randn(n)
A = np.random.randn(n, n)
A_sym = (A + A.T) / 2
b = np.random.randn(n)
tests = [
('a^T x', lambda x: b @ x, lambda x: b),
('x^T x', lambda x: x @ x, lambda x: 2*x),
('x^T A_sym x', lambda x: x @ A_sym @ x, lambda x: 2*A_sym @ x),
('||Ax-b||^2', lambda x: la.norm(A@x-b)**2, lambda x: 2*A.T@(A@x-b)),
('log(|x|^2)', lambda x: np.sum(np.log(x**2+1)), lambda x: 2*x/(x**2+1)),
]
print(f"{'Function':>20} {'Rel. Error':>12} Status")
for name, f_fn, g_fn in tests:
rel_err, status, _, _ = gradient_check(f_fn, g_fn, x0)
print(f' {name:>18} {rel_err:>12.2e} {status}')
print('\nGradient checking: centered differences with h=1e-5 achieves ~1e-10 accuracy.')
8. Loss Landscape Geometry
Neural network loss functions are non-convex surfaces with local minima, saddle points, and flat regions. The gradient norm and Hessian eigenvalues describe the local geometry.
Code cell 27
# === 8.1 Non-Convex Loss Landscape Visualization ===
import numpy as np
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
# Modified Beale function (non-convex, multiple saddle points)
def beale(x, y):
t1 = (1.5 - x + x*y)**2
t2 = (2.25 - x + x*y**2)**2
t3 = (2.625 - x + x*y**3)**2
return t1 + t2 + t3
x = np.linspace(-3, 3, 300)
y = np.linspace(-1.5, 1.5, 300)
X, Y = np.meshgrid(x, y)
Z = beale(X, Y)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# Contour plot
cp = axes[0].contourf(X, Y, np.log1p(Z), levels=40, cmap='plasma')
fig.colorbar(cp, ax=axes[0], label='log(1 + Loss)')
axes[0].contour(X, Y, np.log1p(Z), levels=15, colors='white', alpha=0.3, linewidths=0.5)
axes[0].scatter([3], [0.5], marker='*', s=300, color='gold', zorder=5, label='Global min')
axes[0].set_title('Non-convex loss landscape (Beale function)')
axes[0].set_xlabel('$w_1$'); axes[0].set_ylabel('$w_2$')
axes[0].legend()
# Gradient descent from multiple starts
def grad_beale(x, y, h=1e-5):
gx = (beale(x+h, y) - beale(x-h, y)) / (2*h)
gy = (beale(x, y+h) - beale(x, y-h)) / (2*h)
return gx, gy
cp2 = axes[1].contourf(X, Y, np.log1p(Z), levels=40, cmap='plasma')
starts = [(-2, -1), (-1, 1), (0.5, -1), (1.5, 0.5)]
path_colors = [COLORS['error'], COLORS['tertiary'], COLORS['primary'], COLORS['highlight']]
for (x0, y0), color in zip(starts, path_colors):
px, py = [x0], [y0]
x_c, y_c = x0, y0
for _ in range(200):
gx, gy = grad_beale(x_c, y_c)
norm = np.sqrt(gx**2 + gy**2) + 1e-10
x_c -= 0.005 * gx
y_c -= 0.005 * gy
px.append(x_c); py.append(y_c)
axes[1].plot(px, py, '-', color=color, alpha=0.8, lw=1.5,
label=f'start ({x0},{y0})')
axes[1].scatter([3], [0.5], marker='*', s=300, color='gold', zorder=5, label='Global min')
axes[1].set_xlim(-3, 3); axes[1].set_ylim(-1.5, 1.5)
axes[1].set_title('GD trajectories (different initializations)')
axes[1].set_xlabel('$w_1$'); axes[1].set_ylabel('$w_2$')
axes[1].legend(fontsize=8)
fig.tight_layout()
plt.show()
print('Non-convex landscape: different initializations converge to different local minima.')
Code cell 28
# === 8.2-8.3 Gradient Norm as Diagnostic + Gradient Clipping ===
import numpy as np
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
np.random.seed(42)
# Simulate gradient descent on a simple problem with gradient clipping
m, d = 100, 10
X = np.random.randn(m, d)
w_true = np.random.randn(d)
y = X @ w_true + 0.1 * np.random.randn(m)
def loss(w): return (1/m) * np.sum((X@w - y)**2)
def grad(w): return (2/m) * X.T @ (X@w - y)
def clip_grad(g, max_norm=1.0):
norm = np.linalg.norm(g)
if norm > max_norm:
return g * max_norm / norm, norm
return g, norm
T = 300
results = {}
for clip_val in [None, 0.5, 2.0]:
w = np.ones(d) * 3 # bad init
losses_t, norms_t = [], []
for t in range(T):
g = grad(w)
if clip_val is not None:
g, norm = clip_grad(g, clip_val)
else:
norm = np.linalg.norm(g)
losses_t.append(loss(w))
norms_t.append(norm)
w -= 0.1 * g
results[clip_val] = (losses_t, norms_t)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
labels = {None: 'No clipping', 0.5: 'Clip norm=0.5', 2.0: 'Clip norm=2.0'}
colors = [COLORS['primary'], COLORS['error'], COLORS['tertiary']]
for (clip_val, (ls, ns)), color in zip(results.items(), colors):
axes[0].semilogy(ls, color=color, label=labels[clip_val])
axes[1].semilogy(ns, color=color, label=labels[clip_val])
axes[0].set_title('Loss vs. Training Step'); axes[0].set_xlabel('Step')
axes[0].set_ylabel('Loss'); axes[0].legend()
axes[1].set_title('Gradient Norm vs. Step'); axes[1].set_xlabel('Step')
axes[1].set_ylabel('||∇L||'); axes[1].legend()
for thr in (0.5, 2.0): axes[1].axhline(thr, ls='--', color=COLORS['neutral'], alpha=0.6)
fig.tight_layout()
plt.show()
print('Gradient clipping: rescales gradient direction when norm exceeds threshold.')
print('Direction preserved. Standard in large-scale LLM training (e.g. GPT-3, Llama, Mistral).')
9. Advanced Topics
9.3 Natural Gradient vs. Gradient Descent
The Fisher information matrix $F(\theta)$ defines a Riemannian metric on the statistical manifold. The natural gradient $\tilde{\nabla} L = F^{-1} \nabla L$ accounts for the non-Euclidean geometry and often converges faster than ordinary gradient descent.
Code cell 30
# === 9.3 Natural Gradient vs. Gradient Descent ===
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
'tertiary': '#009988', 'error': '#CC3311',
'neutral': '#555555', 'highlight': '#EE3377'}
np.random.seed(42)
# Fit Gaussian: theta = (mu, log_sigma)
# True params: mu=2, sigma=1 (log_sigma=0)
n_data = 200
x_data = np.random.normal(2.0, 1.0, n_data)
def neg_loglik(theta):
mu, log_sig = theta
sig = np.exp(log_sig)
return 0.5*np.log(2*np.pi) + log_sig + 0.5*np.mean((x_data - mu)**2) / sig**2
def grad_nll(theta):
mu, log_sig = theta
sig = np.exp(log_sig)
sig2 = sig**2
g_mu = -np.mean(x_data - mu) / sig2
g_ls = 1 - np.mean((x_data - mu)**2) / sig2
return np.array([g_mu, g_ls])
def fisher_gaussian(theta):
mu, log_sig = theta
sig = np.exp(log_sig)
# F = diag(1/sig^2, 2)
return np.diag([1/sig**2, 2.0])
theta0 = np.array([0.0, 1.0]) # start: mu=0, sigma=exp(1)≈2.7
T = 100
# Gradient descent
th_gd = theta0.copy()
path_gd = [th_gd.copy()]
for _ in range(T):
th_gd = th_gd - 0.1 * grad_nll(th_gd)
path_gd.append(th_gd.copy())
path_gd = np.array(path_gd)
# Natural gradient descent
th_ng = theta0.copy()
path_ng = [th_ng.copy()]
for _ in range(T):
g = grad_nll(th_ng)
F = fisher_gaussian(th_ng)
nat_g = la.solve(F, g) # F^{-1} g
th_ng = th_ng - 0.1 * nat_g
path_ng.append(th_ng.copy())
path_ng = np.array(path_ng)
# Plot trajectories in (mu, log_sigma) space
fig, ax = plt.subplots(figsize=(8, 7))
ax.plot(path_gd[:,0], path_gd[:,1], 'o-', color=COLORS['primary'],
ms=3, label='Gradient descent')
ax.plot(path_ng[:,0], path_ng[:,1], 's-', color=COLORS['error'],
ms=3, label='Natural gradient descent')
ax.scatter([2.0], [0.0], marker='*', s=300, color='gold', zorder=5, label='True params')
ax.scatter(*theta0, s=100, color=COLORS['neutral'], zorder=5, label='Start')
ax.set_xlabel('$\\mu$'); ax.set_ylabel('$\\log\\sigma$')
ax.set_title('GD vs. Natural Gradient in parameter space')
ax.legend()
fig.tight_layout()
plt.show()
print(f'GD final: mu={path_gd[-1,0]:.3f}, log_sigma={path_gd[-1,1]:.3f}')
print(f'NG final: mu={path_ng[-1,0]:.3f}, log_sigma={path_ng[-1,1]:.3f}')
print(f'True: mu=2.000, log_sigma=0.000')
ng_closer = la.norm(path_ng[-1] - [2,0]) < la.norm(path_gd[-1] - [2,0])
print(f'PASS - natural gradient converges faster' if ng_closer else 'Both converged equally')
9.4 Softmax Gradient Derivation
The Jacobian of the softmax $\mathbf{p} = \mathrm{softmax}(\mathbf{z})$ is:
$$\frac{\partial p_k}{\partial z_j} = p_k(\delta_{kj} - p_j)$$
Combined with cross-entropy, the gradient simplifies to $\partial L/\partial \mathbf{z} = \mathbf{p} - \mathbf{y}$.
Code cell 32
# === Softmax Jacobian and Cross-Entropy Gradient ===
import numpy as np
import numpy.linalg as la
def softmax(z):
z = z - z.max() # numerical stability
e = np.exp(z)
return e / e.sum()
def softmax_jacobian(p):
"""J[k,j] = p_k * (delta_kj - p_j)"""
return np.diag(p) - np.outer(p, p)
def cross_entropy(z, y_onehot):
p = softmax(z)
return -np.sum(y_onehot * np.log(p + 1e-15))
def ce_grad_z(z, y_onehot):
return softmax(z) - y_onehot # elegant!
np.random.seed(42)
K = 5
z = np.random.randn(K)
y_true = np.zeros(K); y_true[2] = 1.0 # one-hot label for class 2
# Verify gradient
h = 1e-5
num_grad = np.array([(cross_entropy(z + h*np.eye(K)[i], y_true)
- cross_entropy(z - h*np.eye(K)[i], y_true)) / (2*h)
for i in range(K)])
anal_grad = ce_grad_z(z, y_true)
print('Softmax cross-entropy gradient verification:')
print(f'Analytic (p - y): {anal_grad.round(4)}')
print(f'Numerical (FD): {num_grad.round(4)}')
rel_err = la.norm(anal_grad - num_grad) / (la.norm(anal_grad) + la.norm(num_grad))
print(f'Relative error: {rel_err:.2e}')
print(f'PASS - softmax CE gradient correct' if rel_err < 1e-6 else 'FAIL')
# Softmax Jacobian
p = softmax(z)
J = softmax_jacobian(p)
print(f'\nSoftmax Jacobian shape: {J.shape}')
print(f'Jacobian is symmetric: {np.allclose(J, J.T)}')
print(f'Row sums = 0: {np.allclose(J.sum(axis=1), 0)}')
print(f'\nTakeaway: grad of CE + softmax = p - y. Elegant, stable, used in every transformer.')
8. Higher-Order Partial Derivatives and the Hessian Preview
Second-order partial derivatives measure the curvature of a scalar field — how fast the gradient itself changes. The matrix of all second partials is the Hessian $H_{ij} = \partial^2 f / \partial x_i \partial x_j$, which characterises loss landscape geometry.
Preview: The Hessian is covered in full in §02 — Jacobians and Hessians. Here we only compute it numerically and observe its role.
Code cell 34
# === 8.1 Numerical Hessian via Finite Differences ===
import numpy as np
np.random.seed(42)
def numerical_hessian(f, x, h=1e-4):
"""Compute Hessian via second-order central differences."""
n = len(x)
H = np.zeros((n, n))
for i in range(n):
for j in range(n):
ei, ej = np.eye(n)[i], np.eye(n)[j]
H[i, j] = (f(x + h*ei + h*ej) - f(x + h*ei - h*ej)
- f(x - h*ei + h*ej) + f(x - h*ei - h*ej)) / (4*h**2)
return H
# Test on quadratic f(x,y) = x^2 + 3xy + 2y^2 → H = [[2,3],[3,4]]
f_quad = lambda x: x[0]**2 + 3*x[0]*x[1] + 2*x[1]**2
x0 = np.array([1.0, 2.0])
H_num = numerical_hessian(f_quad, x0)
H_exact = np.array([[2., 3.], [3., 4.]])
print('Numerical Hessian:\n', H_num)
print('Exact Hessian: \n', H_exact)
ok = np.allclose(H_num, H_exact, atol=1e-6)
print(f'\nPASS: {ok} — Hessian matches to 1e-6')
# Eigenvalues tell us curvature directions
evals, evecs = np.linalg.eigh(H_exact)
print(f'\nEigenvalues: {evals} (both positive → local min)')
print(f'Condition number: {evals[-1]/evals[0]:.2f}')
Code cell 35
# === 8.2 Hessian Eigenstructure Visualization ===
try:
import matplotlib.pyplot as plt
import matplotlib
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
COLORS = {'blue': '#2166AC', 'red': '#D6604D', 'green': '#4DAC26',
'orange': '#F4A582', 'purple': '#762A83', 'gray': '#878787'}
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [10, 4]
fig, axes = plt.subplots(1, 2)
x = np.linspace(-2, 2, 300)
y = np.linspace(-2, 2, 300)
X, Y = np.meshgrid(x, y)
Z = X**2 + 3*X*Y + 2*Y**2
# Left: contours of the quadratic
ax = axes[0]
cs = ax.contourf(X, Y, Z, levels=20, cmap='viridis')
plt.colorbar(cs, ax=ax)
# Draw eigenvectors scaled by eigenvalue
for i, (val, vec) in enumerate(zip(evals, evecs.T)):
ax.annotate('', xy=vec*0.7, xytext=(0, 0),
arrowprops=dict(arrowstyle='->', color=list(COLORS.values())[i], lw=2))
ax.text(vec[0]*0.8, vec[1]*0.8, f'λ={val:.1f}',
fontsize=10, color=list(COLORS.values())[i], fontweight='bold')
ax.set_title('Quadratic $f = x^2+3xy+2y^2$\nHessian eigenvectors')
ax.set_xlabel('x'); ax.set_ylabel('y')
# Right: Saddle point example
f_saddle = lambda xy: xy[0]**2 - xy[1]**2
Z2 = X**2 - Y**2
H_saddle = np.array([[2., 0.], [0., -2.]])
evals2, evecs2 = np.linalg.eigh(H_saddle)
ax2 = axes[1]
cs2 = ax2.contourf(X, Y, Z2, levels=20, cmap='RdBu_r')
plt.colorbar(cs2, ax=ax2)
ax2.set_title(f'Saddle $f=x^2-y^2$\nλ=({evals2[0]:.0f},{evals2[1]:.0f}) → saddle')
ax2.set_xlabel('x'); ax2.set_ylabel('y')
plt.tight_layout()
plt.savefig('/tmp/hessian_curvature.png', dpi=100, bbox_inches='tight')
plt.show()
print('Hessian eigenstructure plots saved.')
else:
print('matplotlib not available.')
9. Gradient Flow and Continuous-Time Optimization
The gradient descent iteration $\theta_{t+1} = \theta_t - \eta \nabla f(\theta_t)$ becomes, as $\eta \to 0$, the gradient flow ODE:
$$\dot{\theta}(t) = -\nabla f(\theta(t))$$
This ODE perspective gives analytical tools: for a strongly convex $f$ with constant $\mu$, the flow satisfies $f(\theta(t)) - f^* \le e^{-2\mu t} \left(f(\theta_0) - f^*\right)$, an exponential decay. Modern analyses of neural collapse, loss landscape geometry, and sharpness-aware minimization often use gradient flow as the backbone model.
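A minimal numerical check of the exponential-decay bound (the quadratic $f(x, y) = x^2 + 5y^2$ and the strong-convexity constant $\mu = 2$ are assumptions chosen to match the next code cell, and the closed-form flow solution is used instead of an ODE solver):

```python
import numpy as np

# Gradient flow on f(x, y) = x^2 + 5y^2 has the closed-form solution
#   x(t) = x0 * exp(-2t),   y(t) = y0 * exp(-10t).
# The smallest Hessian eigenvalue is 2, so f is mu-strongly convex with mu = 2.
mu = 2.0
theta0 = np.array([2.0, 1.5])
f = lambda p: p[0]**2 + 5 * p[1]**2
f_star = 0.0  # minimum value, attained at the origin

for t in [0.0, 0.5, 1.0, 2.0]:
    theta_t = np.array([theta0[0] * np.exp(-2 * t), theta0[1] * np.exp(-10 * t)])
    gap = f(theta_t) - f_star                            # f(theta(t)) - f*
    bound = np.exp(-2 * mu * t) * (f(theta0) - f_star)   # exponential-decay bound
    print(f"t={t:4.1f}  f(theta(t)) - f* = {gap:.3e}  bound = {bound:.3e}  holds: {gap <= bound + 1e-12}")
```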
Code cell 37
# === 9.1 Gradient Flow ODE vs Discrete Gradient Descent ===
from scipy.integrate import solve_ivp
# f(x,y) = x^2 + 5y^2 (elongated bowl, condition number 5)
grad_f = lambda xy: np.array([2*xy[0], 10*xy[1]])
f_val = lambda xy: xy[0]**2 + 5*xy[1]**2
# Continuous gradient flow
def flow_rhs(t, theta):
return -grad_f(theta)
theta0 = np.array([2.0, 1.5])
sol = solve_ivp(flow_rhs, [0, 3], theta0, max_step=0.01, dense_output=True)
t_cont = np.linspace(0, 3, 300)
flow_path = sol.sol(t_cont).T
# Discrete gradient descent at two learning rates
def gradient_descent_path(theta0, lr, n_steps):
path = [theta0.copy()]
theta = theta0.copy()
for _ in range(n_steps):
theta = theta - lr * grad_f(theta)
path.append(theta.copy())
return np.array(path)
disc_01 = gradient_descent_path(theta0, lr=0.05, n_steps=60)
disc_05 = gradient_descent_path(theta0, lr=0.15, n_steps=60)
print(f'Gradient flow final loss: {f_val(flow_path[-1]):.2e}')
print(f'GD lr=0.05 final loss: {f_val(disc_01[-1]):.2e}')
print(f'GD lr=0.15 final loss: {f_val(disc_05[-1]):.2e}')
if 'HAS_MPL' in dir() and HAS_MPL:
pass # plot below in visualization cell
print('Gradient flow data computed.')
Code cell 38
# === 9.2 Visualize Gradient Flow vs Discrete GD ===
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
COLORS = {'blue': '#2166AC', 'red': '#D6604D', 'green': '#4DAC26', 'orange': '#F4A582'}
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
x = np.linspace(-2.5, 2.5, 200)
y = np.linspace(-2, 2, 200)
X, Y = np.meshgrid(x, y)
Z = X**2 + 5*Y**2
# Trajectory plot
ax = axes[0]
ax.contourf(X, Y, Z, levels=15, cmap='viridis', alpha=0.4)
ax.contour(X, Y, Z, levels=15, colors='white', linewidths=0.5, alpha=0.6)
ax.plot(flow_path[:, 0], flow_path[:, 1], '-', color=COLORS['blue'],
lw=2.5, label='Gradient flow (continuous)')
ax.plot(disc_01[:, 0], disc_01[:, 1], 'o-', color=COLORS['red'],
ms=3, lw=1.5, label='GD η=0.05')
ax.plot(disc_05[:, 0], disc_05[:, 1], 's-', color=COLORS['orange'],
ms=3, lw=1.5, label='GD η=0.15')
ax.plot(*theta0, 'k*', ms=12, label='Start')
ax.plot(0, 0, 'k+', ms=15, mew=3, label='Minimum')
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.set_title('Trajectory: gradient flow vs discrete GD')
ax.legend(fontsize=9)
# Loss vs time/steps
ax2 = axes[1]
ax2.semilogy(t_cont, [f_val(p) for p in flow_path], '-', color=COLORS['blue'],
lw=2.5, label='Gradient flow')
steps = np.arange(len(disc_01))
ax2.semilogy(steps * 0.05, [f_val(p) for p in disc_01], 'o-', color=COLORS['red'],
ms=3, lw=1.5, label='GD η=0.05')
ax2.semilogy(steps * 0.15, [f_val(p) for p in disc_05], 's-', color=COLORS['orange'],
ms=3, lw=1.5, label='GD η=0.15')
ax2.set_xlabel('Time / effective time (lr × steps)')
ax2.set_ylabel('Loss (log scale)')
ax2.set_title('Loss convergence')
ax2.legend(fontsize=9)
plt.tight_layout()
plt.savefig('/tmp/gradient_flow.png', dpi=100, bbox_inches='tight')
plt.show()
print('Gradient flow plot saved.')
else:
print('matplotlib not available.')
10. Sparse Gradients and Coordinate-Wise Methods
In large embedding tables (LLMs, recommendation systems), most vocabulary rows receive no tokens in a given batch — their gradient rows are exactly zero. This sparsity is exploited by:
- Sparse optimizers that only update embedding rows with non-zero gradients
- LoRA: instead of computing updates with the full weight gradient $\nabla_W L \in \mathbb{R}^{m \times n}$, parameterise the update as $\Delta W = BA$ with $B \in \mathbb{R}^{m \times r}$, $A \in \mathbb{R}^{r \times n}$, and $r \ll \min(m, n)$. The effective gradient lives in a low-rank subspace (see the sketch after this list).
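A minimal sketch of the low-rank gradient structure (the toy sizes, random matrices, and squared-error objective below are illustrative assumptions, not from the notebook): with effective weight $W_0 + BA$ and upstream gradient $G = \partial L/\partial(W_0 + BA)$, the chain rule gives $\partial L/\partial B = G A^\top$ and $\partial L/\partial A = B^\top G$, and the cumulative update $\Delta W = BA$ has rank at most $r$.

```python
import numpy as np

rng = np.random.default_rng(0)
m, n, r = 6, 5, 2                        # toy sizes (illustrative)
W0 = rng.standard_normal((m, n))         # frozen base weight
B = rng.standard_normal((m, r))          # LoRA factors
A = rng.standard_normal((r, n))
X = rng.standard_normal((n, 3))          # small input batch
Y = rng.standard_normal((m, 3))          # targets for a toy squared-error loss

def loss(B, A):
    return 0.5 * np.sum(((W0 + B @ A) @ X - Y) ** 2)

# Upstream gradient w.r.t. the effective weight W = W0 + B A
G = ((W0 + B @ A) @ X - Y) @ X.T
grad_B, grad_A = G @ A.T, B.T @ G        # chain rule through Delta W = B A

# Finite-difference check of one entry of each factor
h = 1e-6
Bp = B.copy(); Bp[0, 0] += h
Ap = A.copy(); Ap[0, 0] += h
print("dL/dB[0,0]:", grad_B[0, 0], " fd:", (loss(Bp, A) - loss(B, A)) / h)
print("dL/dA[0,0]:", grad_A[0, 0], " fd:", (loss(B, Ap) - loss(B, A)) / h)
print("rank of Delta W = B @ A:", np.linalg.matrix_rank(B @ A), "<= r =", r)
```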
Coordinate descent updates one coordinate at a time:
$$x_i \leftarrow x_i - \eta \, \frac{\partial f}{\partial x_i}, \quad \text{cycling over } i$$
For separable functions (e.g. $f(\mathbf{x}) = \sum_i f_i(x_i)$) this is exact; for general smooth $f$ it converges under mild conditions. Block coordinate descent underpins many alternating optimisation algorithms.
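There is no code cell for coordinate descent in the notebook, so here is a minimal sketch (the small SPD quadratic below is an illustrative assumption). Exact minimisation over one coordinate of $f(\mathbf{w}) = \tfrac{1}{2}\mathbf{w}^\top A \mathbf{w} - \mathbf{b}^\top \mathbf{w}$ gives the Gauss-Seidel update $w_i \leftarrow (b_i - \sum_{j \ne i} A_{ij} w_j)/A_{ii}$:

```python
import numpy as np

# Coordinate descent with exact per-coordinate minimisation on a coupled quadratic
#   f(w) = 0.5 * w^T A w - b^T w,  A symmetric positive definite (illustrative values).
A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, 1.0])
f = lambda w: 0.5 * w @ A @ w - b @ w
w_star = np.linalg.solve(A, b)           # closed-form minimiser for reference

w = np.zeros(2)
for sweep in range(8):
    for i in range(2):
        # b_i minus the off-diagonal terms, obtained by adding back the diagonal term
        w[i] = (b[i] - A[i, :] @ w + A[i, i] * w[i]) / A[i, i]
    print(f"sweep {sweep + 1}: w = {w.round(6)}, f(w) - f* = {f(w) - f(w_star):.3e}")
```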
Code cell 40
# === 10.1 Sparse Gradient: Embedding Table Update ===
import numpy as np
np.random.seed(42)
vocab_size, embed_dim = 50000, 768
batch_size, seq_len = 32, 128
# Simulate which token indices appear in a batch
token_ids = np.random.randint(0, vocab_size, size=(batch_size, seq_len))
unique_tokens = np.unique(token_ids)
sparsity = 1 - len(unique_tokens) / vocab_size
print(f'Vocab size: {vocab_size:,}')
print(f'Unique tokens in batch: {len(unique_tokens):,}')
print(f'Gradient sparsity: {sparsity:.4f} ({sparsity*100:.1f}% rows are zero)')
# Dense vs sparse gradient update cost
dense_ops = vocab_size * embed_dim
sparse_ops = len(unique_tokens) * embed_dim
print(f'\nDense update: {dense_ops:,} scalar operations')
print(f'Sparse update: {sparse_ops:,} scalar operations')
print(f'Speedup: {dense_ops/sparse_ops:.1f}x')
# LoRA: low-rank gradient subspace
rank = 8
m, n = 4096, 4096
lora_params = rank * (m + n)
full_params = m * n
print(f'\nLoRA r={rank}: {lora_params:,} params vs full {full_params:,}')
print(f'Parameter reduction: {full_params/lora_params:.0f}x')
11. Memory vs Computation Trade-offs in Gradient Computation
Computing $\nabla_\theta L$ for a deep network requires storing all intermediate activations during the forward pass so that the backward pass can use them.
For a Transformer with $L$ layers, $T$ tokens, and hidden size $d$:
| Method | Memory | Compute |
|---|---|---|
| Full storage | $O(L \, T \, d)$ | 1 forward + 1 backward (no recomputation) |
| Gradient checkpointing (every $\sqrt{L}$ layers) | $O(\sqrt{L} \, T \, d)$ | $\approx$ 1 extra forward ($\sim$33% overhead) |
| Rematerialisation (no storage) | $O(T \, d)$ | $O(L)$ extra forward passes |
Gradient checkpointing (PyTorch: torch.utils.checkpoint) trades a ~33% compute overhead for a roughly $\sqrt{L}$-fold memory reduction — critical for training long-context models.
→ Automatic differentiation and its memory model: §05 — Automatic Differentiation
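A minimal sketch of the trade-off using torch.utils.checkpoint (PyTorch is an optional dependency here, as in the setup cell; the toy MLP, its sizes, and the use_reentrant=False choice are illustrative assumptions). Checkpointing discards each block's internal activations and recomputes them during backward, yet produces exactly the same gradients:

```python
import torch  # optional dependency; skip this sketch if HAS_TORCH is False
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

torch.manual_seed(0)
blocks = nn.ModuleList([nn.Sequential(nn.Linear(256, 256), nn.GELU()) for _ in range(8)])
x = torch.randn(64, 256, requires_grad=True)

def forward_plain(h):
    for block in blocks:
        h = block(h)                                   # every activation kept for backward
    return h

def forward_checkpointed(h):
    for block in blocks:
        h = checkpoint(block, h, use_reentrant=False)  # recompute inside backward
    return h

loss_plain = forward_plain(x).pow(2).mean()
loss_plain.backward()
g_plain = x.grad.clone(); x.grad = None

loss_ckpt = forward_checkpointed(x).pow(2).mean()
loss_ckpt.backward()
print("checkpointed gradient matches:", torch.allclose(g_plain, x.grad, atol=1e-6))
```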
Code cell 42
# === 11.1 Gradient Statistics Across Training ===
import numpy as np
np.random.seed(42)
# Simulate gradient norms across training steps
# Typical pattern: high early, decreasing, occasional spikes
n_steps = 500
t = np.arange(n_steps)
# Base decaying norm + noise + occasional spikes
base = 2.0 * np.exp(-t / 200) + 0.1
noise = 0.05 * np.random.randn(n_steps)
spike_locs = np.random.choice(n_steps, size=15, replace=False)
spikes = np.zeros(n_steps)
spikes[spike_locs] = np.random.exponential(1.5, size=15)
grad_norms = np.abs(base + noise + spikes)
clip_threshold = 1.0
clipped_norms = np.minimum(grad_norms, clip_threshold)
print(f'Max gradient norm: {grad_norms.max():.3f}')
print(f'Mean gradient norm: {grad_norms.mean():.3f}')
print(f'Fraction clipped: {(grad_norms > clip_threshold).mean():.3f}')
print(f'Mean after clipping: {clipped_norms.mean():.3f}')
try:
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(t, grad_norms, alpha=0.7, color='#D6604D', lw=1, label='Raw norm')
ax.plot(t, clipped_norms, alpha=0.9, color='#2166AC', lw=1.5, label=f'Clipped (max={clip_threshold})')
ax.axhline(clip_threshold, color='#4DAC26', ls='--', lw=1.5, label='Clip threshold')
ax.set_xlabel('Training step'); ax.set_ylabel('‖∇‖₂')
ax.set_title('Gradient norm during training: raw vs clipped')
ax.legend()
plt.tight_layout()
plt.savefig('/tmp/grad_stats.png', dpi=100, bbox_inches='tight')
plt.show()
print('Gradient statistics plot saved.')
except ImportError:
print('matplotlib not available.')
12. Wirtinger Calculus for Complex-Valued Functions
Modern neural networks use complex-valued operations in signal processing, MRI reconstruction, and some attention variants. For $f : \mathbb{C}^n \to \mathbb{R}$ (a real-valued function of complex variables), the standard complex derivative does not exist unless $f$ is holomorphic — but real-valued losses typically are not.
Wirtinger derivatives split complex differentiation into:
$$\frac{\partial}{\partial z} = \frac{1}{2}\left(\frac{\partial}{\partial x} - i \frac{\partial}{\partial y}\right), \qquad \frac{\partial}{\partial \bar{z}} = \frac{1}{2}\left(\frac{\partial}{\partial x} + i \frac{\partial}{\partial y}\right)$$
where $z = x + iy$. For real-valued $f$, the steepest-descent direction is $-\partial f/\partial \bar{z}$ — the conjugate Wirtinger gradient. PyTorch uses this convention for complex tensors.
Code cell 44
# === 12.1 Wirtinger Gradient Demo ===
import numpy as np
# f(z) = |z - z0|^2 = (x-x0)^2 + (y-y0)^2
# Wirtinger: df/dz_bar = (x-x0) + i(y-y0) = z - z0
z0 = 1.0 + 2.0j
z = 3.0 - 1.0j
f = lambda z: abs(z - z0)**2
# Numerical Wirtinger gradient: conjugate Wirtinger ∂f/∂z*
h = 1e-7
# ∂f/∂x ≈ (f(z+h) - f(z-h)) / (2h)
# ∂f/∂y ≈ (f(z+ih) - f(z-ih)) / (2h)
dfdx = (f(z + h) - f(z - h)) / (2*h)
dfdy = (f(z + 1j*h) - f(z - 1j*h)) / (2*h)
wirtinger_conj = 0.5 * (dfdx + 1j * dfdy) # ∂f/∂z̄
exact_conj = z - z0 # known closed form
print(f'Wirtinger ∂f/∂z̄ (numerical): {wirtinger_conj:.6f}')
print(f'Exact (z - z0): {exact_conj:.6f}')
ok = abs(wirtinger_conj - exact_conj) < 1e-5
print(f'PASS: {ok} — numerical matches exact')
# Gradient step: move z in direction -(df/dz_bar)
lr = 0.5
z_new = z - lr * wirtinger_conj
print(f'\nz = {z:.4f}, f(z) = {f(z):.4f}')
print(f'z_new = {z_new:.4f}, f(z_new) = {f(z_new):.4f}')
print(f'Minimum is at z0 = {z0:.4f}')
13. Summary and Connections
Key results from this notebook
| Result | Formula | Where Used |
|---|---|---|
| Gradient definition | $\nabla f = (\partial f/\partial x_1, \dots, \partial f/\partial x_n)^\top$ | Every optimizer |
| Steepest ascent | $\arg\max_{\|u\|=1} D_u f = \nabla f / \|\nabla f\|$ | Gradient descent direction |
| Directional derivative | $D_u f = \nabla f^\top u$ | Sensitivity analysis |
| MSE gradient | $\nabla_w L = \frac{2}{m} X^\top (Xw - y)$ | Linear regression |
| CE gradient | $\nabla_w L = \frac{1}{m} X^\top (p - y)$ | Logistic regression, LLMs |
| Gradient checking | centered differences, relative error $< 10^{-5}$ | Debugging backprop |
| Gradient clipping | $g \leftarrow g \cdot \min(1, c / \|g\|)$ | Stable training |
What comes next
- §02 — Jacobians and Hessians: the gradient generalises to vector-valued functions (Jacobian) and the Hessian captures second-order curvature
- §03 — Chain Rule and Backpropagation: how gradients flow through composed functions; the mathematical engine of neural networks
- §04 — Optimality Conditions: when does $\nabla f = 0$ guarantee a minimum? Lagrange multipliers, KKT theory
- §05 — Automatic Differentiation: how PyTorch and JAX compute all of the above exactly and efficiently
14. Convergence Theory for Gradient Descent
For an $L$-smooth convex function ($\|\nabla f(x) - \nabla f(y)\| \le L \|x - y\|$), gradient descent with step size $\eta = 1/L$ satisfies:
$$f(\theta_t) - f^* \le \frac{L \|\theta_0 - \theta^*\|^2}{2t}$$
This $O(1/t)$ rate is tight for convex functions. For $\mu$-strongly convex functions:
$$f(\theta_t) - f^* \le \left(1 - \frac{\mu}{L}\right)^{t} \left(f(\theta_0) - f^*\right)$$
a geometric (linear) rate. The ratio $\kappa = L/\mu$ is the condition number — elongated loss landscapes (large $\kappa$) slow convergence dramatically. Preconditioning (Adam, Adagrad, natural gradient) aims to reduce the effective $\kappa$.
Code cell 47
# === 14.1 Gradient Descent Convergence Rates ===
import numpy as np
np.random.seed(42)
# Quadratic f(x) = (1/2) x^T A x with A = diag(mu, L)
# Condition numbers to compare
conditions = [1, 5, 20, 100]
n_steps = 200
mu = 1.0
def gd_convergence(kappa, n_steps):
L = kappa * mu
eta = 1.0 / L # optimal step size
# Initial loss = 1.0 (normalised)
# Linear convergence: loss_t = (1 - mu/L)^t
t = np.arange(n_steps + 1)
rate = 1 - mu / L
return t, rate ** t
try:
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 5))
cmap = plt.cm.plasma
colors = [cmap(i/len(conditions)) for i in range(len(conditions))]
for kappa, color in zip(conditions, colors):
t, losses = gd_convergence(kappa, n_steps)
ax.semilogy(t, losses, lw=2, color=color, label=f'κ={kappa}, η=1/L')
# Mark steps to reach 1e-4 precision
idx = np.searchsorted(-losses, -1e-4)
if idx < n_steps:
ax.axvline(idx, color=color, lw=0.8, ls=':', alpha=0.7)
ax.set_xlabel('Gradient descent step')
ax.set_ylabel('$f(\\theta_t) - f^*$ (log scale)')
ax.set_title('Convergence rate vs condition number κ = L/μ')
ax.legend()
ax.set_xlim(0, n_steps)
plt.tight_layout()
plt.savefig('/tmp/gd_convergence.png', dpi=100, bbox_inches='tight')
plt.show()
print('Convergence plot saved.')
except ImportError:
print('matplotlib not available.')
# Analytical steps to 1e-4 precision
print('\nSteps to reach 1e-4 precision:')
for kappa in conditions:
L = kappa * mu
rate = 1 - mu / L
steps = int(np.ceil(np.log(1e-4) / np.log(rate)))
print(f' κ={kappa:3d}: {steps:5d} steps (rate={rate:.4f})')
Code cell 48
# === 14.2 Adaptive vs Standard Gradient Descent ===
import numpy as np
np.random.seed(42)
# Ill-conditioned quadratic: f(x,y) = 0.5*(x^2/0.01 + y^2) → condition number 100
def f_ill(xy): return 0.5 * (xy[0]**2 / 0.01 + xy[1]**2)
def grad_ill(xy): return np.array([xy[0] / 0.01, xy[1]])
theta0 = np.array([0.5, 1.0])
n_steps = 150
# Standard GD with η = 0.005 (safe for L=100)
def run_gd(theta0, lr, n_steps):
theta = theta0.copy()
path = [theta.copy()]
for _ in range(n_steps):
theta -= lr * grad_ill(theta)
path.append(theta.copy())
return np.array(path)
# Adam (diagonal preconditioning approximates the Hessian inverse)
def run_adam(theta0, lr, n_steps, b1=0.9, b2=0.999, eps=1e-8):
theta = theta0.copy()
m = np.zeros_like(theta)
v = np.zeros_like(theta)
path = [theta.copy()]
for t in range(1, n_steps + 1):
g = grad_ill(theta)
m = b1 * m + (1 - b1) * g
v = b2 * v + (1 - b2) * g**2
m_hat = m / (1 - b1**t)
v_hat = v / (1 - b2**t)
theta -= lr * m_hat / (np.sqrt(v_hat) + eps)
path.append(theta.copy())
return np.array(path)
gd_path = run_gd(theta0, lr=0.005, n_steps=n_steps)
adam_path = run_adam(theta0, lr=0.1, n_steps=n_steps)
gd_losses = [f_ill(p) for p in gd_path]
adam_losses = [f_ill(p) for p in adam_path]
print(f'GD final loss: {gd_losses[-1]:.6f}')
print(f'Adam final loss: {adam_losses[-1]:.6f}')
print(f'Adam speedup factor: ~{gd_losses[50]/adam_losses[50]:.0f}x at step 50')
try:
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
x = np.linspace(-0.6, 0.6, 200)
y = np.linspace(-1.2, 1.2, 200)
X, Y = np.meshgrid(x, y)
Z = 0.5 * (X**2/0.01 + Y**2)
ax = axes[0]
ax.contourf(X, Y, Z, levels=20, cmap='viridis', alpha=0.5)
ax.plot(gd_path[:,0], gd_path[:,1], 'o-', ms=2, lw=1.5, color='#D6604D', label='GD η=0.005')
ax.plot(adam_path[:,0], adam_path[:,1], 's-', ms=2, lw=1.5, color='#2166AC', label='Adam lr=0.1')
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.set_title('Trajectory (κ=100)')
ax.legend(fontsize=9)
axes[1].semilogy(gd_losses, color='#D6604D', lw=2, label='GD')
axes[1].semilogy(adam_losses, color='#2166AC', lw=2, label='Adam')
axes[1].set_xlabel('Step'); axes[1].set_ylabel('Loss (log)')
axes[1].set_title('Loss convergence')
axes[1].legend()
plt.tight_layout()
plt.savefig('/tmp/adam_vs_gd.png', dpi=100, bbox_inches='tight')
plt.show()
print('Adam vs GD plot saved.')
except ImportError:
print('matplotlib not available.')
15. Stochastic Gradients and Mini-Batch Estimation
In practice, the true gradient $\nabla L(\theta) = \frac{1}{N} \sum_{i=1}^{N} \nabla \ell_i(\theta)$ is computed over the full dataset — prohibitively expensive at modern token counts.
Stochastic gradient descent (SGD) uses a mini-batch estimate:
$$\hat{g} = \frac{1}{B} \sum_{i \in \mathcal{B}} \nabla \ell_i(\theta), \qquad |\mathcal{B}| = B$$
Statistical properties:
- $\mathbb{E}[\hat{g}] = \nabla L(\theta)$ — the mini-batch gradient is unbiased
- $\mathrm{Var}[\hat{g}] \propto 1/B$ — variance decreases as $1/B$; larger batches give more accurate gradient estimates
- Gradient noise acts as implicit regularisation, helping escape sharp minima
The signal-to-noise ratio $\|\nabla L\|^2 / \mathrm{Var}[\hat{g}]$ guides batch size selection — when the SNR is low (noise-dominated), increasing the batch size helps; when it is high, larger batches add diminishing returns.
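A minimal sketch of this rule of thumb (the toy linear-regression data mirror the next code cell, and the scalar SNR definition $\mathrm{SNR}(B) = \bar{g}^2 / (\mathrm{Var}[g_i]/B)$ is an assumption chosen for illustration):

```python
import numpy as np

rng = np.random.default_rng(42)
N = 10000
x = rng.standard_normal(N)
y = 2.0 * x + 0.5 * rng.standard_normal(N)
w = 1.0                                  # current weight estimate

# Per-example gradient of 0.5*(w*x_i - y_i)^2 with respect to w
g_i = x * (w * x - y)
signal = np.mean(g_i) ** 2               # squared full-batch gradient (scalar parameter)
noise_var = np.var(g_i)                  # per-example gradient variance

print(f"full-batch gradient: {np.mean(g_i):+.4f}")
for B in [1, 8, 64, 512, 4096]:
    snr = signal / (noise_var / B)       # mini-batch variance shrinks as 1/B
    print(f"B={B:5d}  SNR ~ {snr:9.2f}")
print("Once the SNR is well above 1, doubling B barely changes the update direction.")
```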
Code cell 50
# === 15.1 Mini-Batch Gradient Variance vs Batch Size ===
import numpy as np
np.random.seed(42)
# Dataset: N samples from a linear model y = 2x + noise
N = 10000
x_data = np.random.randn(N)
y_data = 2.0 * x_data + 0.5 * np.random.randn(N)
# True gradient of MSE w.r.t. w: (2/N) sum_i x_i*(x_i*w - y_i)
w = 1.0 # current weight estimate
true_grad = (2.0 / N) * np.sum(x_data * (w * x_data - y_data))
# Mini-batch gradient variance vs batch size
batch_sizes = [1, 8, 32, 128, 512, 2048]
n_trials = 500
print(f'True gradient: {true_grad:.6f}')
print(f'\nBatch size | Mean estimate | Std dev | Variance reduction')
print('-' * 65)
grad_stds = []
for B in batch_sizes:
estimates = []
for _ in range(n_trials):
idx = np.random.choice(N, size=B, replace=False)
xb, yb = x_data[idx], y_data[idx]
g = (2.0 / B) * np.sum(xb * (w * xb - yb))
estimates.append(g)
mean_g = np.mean(estimates)
std_g = np.std(estimates)
grad_stds.append(std_g)
print(f'B={B:5d} | {mean_g:+.6f} | {std_g:.6f} | {(grad_stds[0]/std_g)**2:.1f}x')
# Verify variance scales as 1/B
ratio = grad_stds[0] / grad_stds[-1]
expected_ratio = np.sqrt(batch_sizes[-1] / batch_sizes[0])
ok = abs(ratio - expected_ratio) / expected_ratio < 0.15
print(f'\nPASS: {ok} — variance scales as 1/B (std ratio={ratio:.2f}, expected≈{expected_ratio:.2f})')
16. Notebook Summary
This notebook covered the complete theory and computation of partial derivatives and gradients, from first principles to modern ML applications:
Topics covered
| Section | Key result | Code demo |
|---|---|---|
| Scalar fields | Level sets, topology of loss landscapes | 3D surface plots |
| Partial derivatives | Limit definition, finite difference convergence | Error vs h log-log |
| Gradient | Standard forms, steepest ascent proof | Quiver fields |
| Directional derivatives | Maximised by ∇f direction | Polar angle sweep |
| ML gradients | MSE and CE gradient derivations | Logistic regression training |
| Gradient descent | Trajectory visualisation, multiple learning rates | 2D landscape |
| Numerical differentiation | Forward vs centred differences, optimal h | Error curves |
| Gradient checking | Relative error < 1e-5 for correct gradients | 5 test functions |
| Loss landscape | Beale function, multiple basins, GD trajectories | Non-convex plots |
| Gradient clipping | Norm-based clipping demo | Training stability |
| Natural gradient | FIM-preconditioned update for Gaussian MLE | Trajectory comparison |
| Softmax Jacobian | Diagonal - outer product formula | CE gradient verification |
| Hessian (preview) | Eigenstructure, curvature directions | Eigenvector overlay |
| Gradient flow | Continuous-time ODE vs discrete GD | Trajectory + loss curves |
| Sparse gradients | Embedding table update, LoRA parameterisation | Cost comparison |
| Wirtinger calculus | Complex gradient for real-valued losses | Numerical vs exact ∂f/∂z̄ check |
| Convergence theory | O(1/t) convex, linear strongly convex | κ vs steps plot |
| Adam vs GD | Adaptive preconditioning on ill-conditioned quadratic | Trajectory plot |
| SGD variance | 1/B scaling of mini-batch gradient noise | Batch size sweep |
Next steps
- Exercises: exercises.ipynb — 10 graded problems from partial derivatives to natural gradient
- Next section: §02 — Jacobians and Hessians extends these ideas to vector-valued functions and second-order curvature