Theory Notebook
Converted from theory.ipynb for web reading.
Derivatives and Differentiation
"The derivative is the instantaneous rate of change — and backpropagation is just the chain rule applied to a computation graph."
Interactive theory notebook covering: derivative definition, differentiation rules, chain rule, activation function derivatives, numerical differentiation, and optimization foundations.
Companion: notes.md | exercises.ipynb
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
ok = bool(cond)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
return ok
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
return ok
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
def forward_diff(f, x, h=1e-6):
return (f(x + h) - f(x)) / h
def backward_diff(f, x, h=1e-6):
return (f(x) - f(x - h)) / h
def grad_check(f, x, analytic_grad, h=1e-6):
x = np.asarray(x, dtype=float)
analytic_grad = np.asarray(analytic_grad, dtype=float)
numeric_grad = np.zeros_like(x, dtype=float)
for idx in np.ndindex(x.shape):
x_plus = x.copy(); x_minus = x.copy()
x_plus[idx] += h; x_minus[idx] -= h
numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
return la.norm(analytic_grad - numeric_grad) / denom
def check(name, got, expected, tol=1e-8):
return check_close(name, got, expected, tol=tol)
print("Chapter helper setup complete.")
1. Intuition — The Derivative as Instantaneous Rate of Change
The derivative measures how fast a function changes at a point. We visualize the secant-to-tangent limit and explore what the derivative means geometrically.
Code cell 5
# === 1.1 Secant Lines Converging to Tangent ===
def f(x):
return x**2
def f_prime(x):
return 2*x
a = 1.0 # Point of tangency
h_values = [1.0, 0.5, 0.2, 0.05]
x = np.linspace(-0.5, 2.5, 300)
if HAS_MPL:
colors = plt.cm.plasma(np.linspace(0.2, 0.9, len(h_values)))
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax = axes[0]
ax.plot(x, f(x), 'k-', lw=2, label='$f(x) = x^2$')
# Secant lines
for i, h in enumerate(h_values):
slope = (f(a+h) - f(a)) / h
x_line = np.linspace(a-0.3, a+h+0.3, 50)
y_line = f(a) + slope*(x_line - a)
ax.plot(x_line, y_line, '--', color=colors[i], lw=1.5,
label=f'h={h:.2f}, slope={slope:.2f}')
# Tangent
x_tang = np.linspace(a-0.8, a+0.8, 50)
ax.plot(x_tang, f(a) + f_prime(a)*(x_tang - a), 'r-', lw=2.5,
label=f"Tangent: slope={f_prime(a):.1f}")
ax.plot(a, f(a), 'ro', ms=8)
ax.set_xlim(-0.2, 2.2); ax.set_ylim(-0.2, 4.5)
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Secant Lines → Tangent Line')
ax.legend(fontsize=9)
# Right panel: slope vs h
ax2 = axes[1]
h_fine = np.logspace(-4, 0, 200)
slopes = [(f(a+h) - f(a)) / h for h in h_fine]
ax2.semilogx(h_fine, slopes, 'b-', lw=2, label='Secant slope $(f(1+h)-f(1))/h$')
ax2.axhline(f_prime(a), color='r', ls='--', lw=2, label=f"True $f'(1)$ = {f_prime(a)}")
ax2.set_xlabel('Step size h'); ax2.set_ylabel('Slope')
ax2.set_title('Convergence of Secant to Tangent')
ax2.legend()
plt.tight_layout()
plt.savefig('/tmp/secant_tangent.png', dpi=100, bbox_inches='tight')
plt.show()
print('Figure saved.')
# Verify derivative
h = 1e-7
numerical = (f(1+h) - f(1)) / h
analytic = f_prime(1)
print(f'Numerical f\'(1): {numerical:.8f}')
print(f'Analytic f\'(1): {analytic:.8f}')
print(f'PASS - error < 1e-6: {abs(numerical - analytic) < 1e-6}')
2. Formal Definition — Derivative as a Limit
The derivative is defined as a limit of difference quotients: $f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$. We compute this limit explicitly for several functions and verify the results numerically.
Code cell 7
# === 2.1 Derivative from Definition — Worked Examples ===
# Example 1: f(x) = x^2, f'(x) = 2x
def limit_x2(x, h):
return ((x+h)**2 - x**2) / h
# Example 2: f(x) = sqrt(x), f'(x) = 1/(2*sqrt(x))
def limit_sqrt(x, h):
return (np.sqrt(x+h) - np.sqrt(x)) / h
# Example 3: f(x) = e^x, f'(x) = e^x
def limit_exp(x, h):
return (np.exp(x+h) - np.exp(x)) / h
x0 = 2.0
h_vals = [0.1, 0.01, 0.001, 1e-6]
print('=== Convergence of Difference Quotients ===')
print(f'\nf(x) = x^2 at x={x0} (true f\'={2*x0})')
for h in h_vals:
print(f' h={h:.0e}: {limit_x2(x0, h):.8f}')
print(f'\nf(x) = sqrt(x) at x={x0} (true f\'={1/(2*np.sqrt(x0)):.8f})')
for h in h_vals:
print(f' h={h:.0e}: {limit_sqrt(x0, h):.8f}')
print(f'\nf(x) = e^x at x={x0} (true f\'={np.exp(x0):.8f})')
for h in h_vals:
print(f' h={h:.0e}: {limit_exp(x0, h):.8f}')
Code cell 8
# === 2.2 Differentiable vs Continuous — ReLU and |x| ===
x = np.linspace(-2, 2, 400)
relu = np.maximum(0, x)
abs_x = np.abs(x)
# Difference quotients at x=0
h_vals = [0.5, 0.1, 0.01, 0.001]
print('=== Left and Right Derivatives at x=0 ===')
print('\nReLU:')
for h in h_vals:
left = (np.maximum(0, -h) - 0) / (-h)
right = (np.maximum(0, h) - 0) / h
print(f' h={h:.3f}: left={left:.3f}, right={right:.3f}')
print(' → right limit = 1, left limit = 0 → NOT differentiable (subgradient ∈ [0,1])')
print('\n|x|:')
for h in h_vals:
left = (abs(-h) - 0) / (-h)
right = (abs(h) - 0) / h
print(f' h={h:.3f}: left={left:.3f}, right={right:.3f}')
print(' → right limit = 1, left limit = -1 → NOT differentiable (corner)')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, func, name in [(axes[0], relu, 'ReLU(x)'), (axes[1], abs_x, '|x|')]:
ax.plot(x, func, 'b-', lw=2, label=name)
ax.axvline(0, color='r', ls='--', alpha=0.5, label='x=0 (non-diff.)')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title(f'{name} — continuous but not differentiable at 0')
ax.legend()
plt.tight_layout(); plt.show()
3. Basic Differentiation Rules
The power, product, quotient, and sum rules let us differentiate any algebraic expression without computing limits. We verify each rule numerically.
Code cell 10
# === 3.1-3.5 Differentiation Rules — Numerical Verification ===
def centered_diff(f, x, h=1e-5):
return (f(x+h) - f(x-h)) / (2*h)
def check(name, analytic_fn, numerical_fn, x):
a = analytic_fn(x)
n = numerical_fn(x)
ok = abs(a - n) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} {name}: analytic={a:.8f}, numerical={n:.8f}")
x = 1.5
print('=== Derivative Rules — Verification at x=1.5 ===')
# Power rule: (x^5)' = 5x^4
check('Power rule x^5', lambda x: 5*x**4, lambda x: centered_diff(lambda t: t**5, x), x)
# Product rule: (x^2 * sin(x))' = 2x*sin(x) + x^2*cos(x)
check('Product rule x^2*sin(x)',
lambda x: 2*x*np.sin(x) + x**2*np.cos(x),
lambda x: centered_diff(lambda t: t**2*np.sin(t), x), x)
# Quotient rule: (x^2/(x+1))' = (2x(x+1) - x^2)/(x+1)^2 = (x^2+2x)/(x+1)^2
check('Quotient rule x^2/(x+1)',
lambda x: (x**2 + 2*x) / (x+1)**2,
lambda x: centered_diff(lambda t: t**2/(t+1), x), x)
# Chain rule: sin(x^3)' = cos(x^3)*3x^2
check('Chain rule sin(x^3)',
lambda x: np.cos(x**3)*3*x**2,
lambda x: centered_diff(lambda t: np.sin(t**3), x), x)
# Logarithm: (ln(x))' = 1/x
check('Logarithm (ln x)',
lambda x: 1/x,
lambda x: centered_diff(np.log, x), x)
# Composite: e^(-x^2/2) - Gaussian kernel
check('Gaussian e^(-x^2/2)',
lambda x: -x*np.exp(-x**2/2),
lambda x: centered_diff(lambda t: np.exp(-t**2/2), x), x)
4. The Chain Rule
The chain rule is the mathematical foundation of backpropagation. Every node in a computation graph applies this rule to propagate gradients.
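Stated precisely, and in the layered form that backpropagation uses (the scalar demo below walks through exactly this factorization):

$$
(f \circ g)'(x) = f'(g(x))\, g'(x),
\qquad
\frac{\partial L}{\partial w} = \frac{\partial L}{\partial a} \cdot \frac{\partial a}{\partial z} \cdot \frac{\partial z}{\partial w}.
$$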
Code cell 12
# === 4.1-4.3 Chain Rule — Composition Analysis ===
def centered_diff(f, x, h=1e-6):
return (f(x+h) - f(x-h)) / (2*h)
x_vals = np.linspace(-2, 2, 200)
x = 1.0
# Example 1: y = e^(-x^2/2) — Gaussian
f1 = lambda t: np.exp(-t**2/2)
f1_prime_analytic = lambda t: -t * np.exp(-t**2/2)
err1 = abs(centered_diff(f1, x) - f1_prime_analytic(x))
print(f'Gaussian derivative error: {err1:.2e}')
# Example 2: y = sin(ln(x)) for x>0
f2 = lambda t: np.sin(np.log(t))
f2_prime_analytic = lambda t: np.cos(np.log(t)) / t
err2 = abs(centered_diff(f2, 2.0) - f2_prime_analytic(2.0))
print(f'sin(ln(x)) derivative error: {err2:.2e}')
# Example 3: y = (1+x^2)^10
f3 = lambda t: (1+t**2)**10
f3_prime_analytic = lambda t: 20*t*(1+t**2)**9
err3 = abs(centered_diff(f3, x) - f3_prime_analytic(x))
print(f'(1+x^2)^10 derivative error: {err3:.2e}')
print('\nAll chain rule verifications PASS' if max(err1,err2,err3) < 1e-5 else 'CHECK FAILED')
# Scalar backprop demo
print('\n=== Scalar Backpropagation Demo ===')
print('Network: L = (sigma(w*x+b) - y)^2 / 2')
w, b, x_in, y = 2.0, -1.0, 1.5, 0.8
# Forward pass
z = w*x_in + b
sigma_z = 1 / (1 + np.exp(-z))
L = 0.5 * (sigma_z - y)**2
# Backward pass
dL_da = sigma_z - y
da_dz = sigma_z * (1 - sigma_z)
dL_dz = dL_da * da_dz
dL_dw = dL_dz * x_in
dL_db = dL_dz * 1
# Numerical check
num_dw = (0.5*(1/(1+np.exp(-((w+1e-5)*x_in+b)))-y)**2 - 0.5*(1/(1+np.exp(-((w-1e-5)*x_in+b)))-y)**2) / 2e-5
print(f'dL/dw — analytic: {dL_dw:.8f}, numerical: {num_dw:.8f}')
print(f'PASS: {abs(dL_dw - num_dw) < 1e-5}')
5. Implicit Differentiation and Linear Approximation
When $y$ is defined implicitly by $F(x, y) = 0$, differentiate both sides with respect to $x$, treating $y$ as a function of $x$; this gives $dy/dx = -F_x / F_y$ wherever $F_y \neq 0$. Linear approximation, $f(x) \approx f(a) + f'(a)(x - a)$, is the derivative's most direct application.
Code cell 14
# === 5.1-5.3 Implicit Differentiation and Linear Approximation ===
import numpy as np
# --- Implicit: circle x^2 + y^2 = r^2 ---
# dy/dx = -x/y
r = 3.0
theta = np.linspace(0, 2*np.pi, 400)
x_circ = r * np.cos(theta)
y_circ = r * np.sin(theta)
# At point (x0, y0) on upper semicircle
x0 = 2.0
y0 = np.sqrt(r**2 - x0**2)
slope_implicit = -x0 / y0
# Verify numerically: y = sqrt(r^2 - x^2), dy/dx = -x/sqrt(r^2-x^2)
slope_numerical = (-x0 / np.sqrt(r**2 - x0**2))
print(f'Circle implicit slope at ({x0:.1f}, {y0:.3f}): {slope_implicit:.6f}')
print(f'Numerical check: {slope_numerical:.6f}')
print(f'PASS: {abs(slope_implicit - slope_numerical) < 1e-10}')
# --- Linear approximation demo ---
print('\n=== Linear Approximation: sin(x) near x=pi/4 ===')
a = np.pi / 4
f_a = np.sin(a)
f_prime_a = np.cos(a)
x_test_vals = [a + dx for dx in [-0.3, -0.1, 0.1, 0.3]]
print(f'Linearization: f(x) ≈ {f_a:.4f} + {f_prime_a:.4f}*(x - {a:.4f})')
print(f'{"x":>8} {"True f(x)":>12} {"Linear approx":>14} {"Error":>10}')
for x in x_test_vals:
true_val = np.sin(x)
linear_val = f_a + f_prime_a*(x - a)
error = abs(true_val - linear_val)
print(f'{x:8.4f} {true_val:12.6f} {linear_val:14.6f} {error:10.2e}')
# Cross-entropy gradient via related rates
print('\n=== Cross-Entropy Gradient via Chain Rule ===')
z = 1.5
sigma_z = 1/(1+np.exp(-z))
# L = -log(sigma(z)), dL/dz = sigma(z) - 1
dL_dz_analytic = sigma_z - 1
h = 1e-6
L_plus = -np.log(1/(1+np.exp(-(z+h))))
L_minus = -np.log(1/(1+np.exp(-(z-h))))
dL_dz_numerical = (L_plus - L_minus) / (2*h)
print(f'dL/dz = sigma(z)-1: analytic={dL_dz_analytic:.8f}, numerical={dL_dz_numerical:.8f}')
print(f'PASS: {abs(dL_dz_analytic - dL_dz_numerical) < 1e-7}')
6. Higher-Order Derivatives and Concavity
$f''(x) > 0$ means concave up; $f''(x) < 0$ means concave down. The second derivative determines local curvature of the loss surface.
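The second-derivative test applied in the cell below: at a critical point $c$ with $f'(c) = 0$,

$$
f''(c) > 0 \Rightarrow \text{local minimum}, \qquad
f''(c) < 0 \Rightarrow \text{local maximum}, \qquad
f''(c) = 0 \Rightarrow \text{inconclusive}.
$$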
Code cell 16
# === 6.1-6.4 Higher-Order Derivatives ===
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
x = np.linspace(-3, 3, 500)
# f = x^4 - 4x^2 (W-shape — two minima)
f_vals = x**4 - 4*x**2
fp_vals = 4*x**3 - 8*x
fpp_vals = 12*x**2 - 8
# Critical points: f' = 0 => 4x^3 - 8x = 4x(x^2-2) = 0 => x in {0, ±sqrt(2)}
crit = [0, np.sqrt(2), -np.sqrt(2)]
print('=== Critical Points of f(x) = x^4 - 4x^2 ===')
for c in crit:
fpc = 4*c**3 - 8*c
fppc = 12*c**2 - 8
kind = 'local min' if fppc > 0 else ('local max' if fppc < 0 else 'inconclusive')
print(f" x={c:+.4f}: f={c**4-4*c**2:.4f}, f_prime={fpc:.4f}, f_double_prime={fppc:.4f} -> {kind}")
# Inflection points: f'' = 0 => 12x^2 = 8 => x = ±sqrt(2/3)
infl = np.sqrt(2/3)
print(f'\nInflection points at x = ±{infl:.4f}')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, vals, name, color in zip(axes,
[f_vals, fp_vals, fpp_vals],
['f(x) = x⁴ - 4x²', "f'(x)", "f''(x)"],
['blue', 'green', 'red']):
ax.plot(x, vals, color=color, lw=2, label=name)
ax.axhline(0, color='k', lw=0.8)
ax.set_xlim(-3, 3); ax.set_ylim(-5, 10)
ax.set_xlabel('x'); ax.set_title(name)
ax.legend()
plt.suptitle('Function, First, and Second Derivative', fontsize=13)
plt.tight_layout(); plt.show()
7. Extrema, Critical Points, and the Mean Value Theorem
The MVT guarantees a point where the instantaneous rate equals the average rate. This underlies Lipschitz bounds and gradient descent convergence theory.
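Precisely: if $f$ is continuous on $[a, b]$ and differentiable on $(a, b)$, then

$$
\exists\, c \in (a, b): \quad f'(c) = \frac{f(b) - f(a)}{b - a}.
$$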
Code cell 18
# === 7.1-7.4 Extrema and MVT ===
# --- Global extrema on closed interval ---
def f(x):
return x**3 - 3*x
def fp(x):
return 3*x**2 - 3
a, b = -2.0, 2.5
# Critical points in (a,b): f'(x) = 0 => x = ±1
crit = [-1.0, 1.0]
candidates = [a] + [c for c in crit if a < c < b] + [b]
print('=== Global Extrema of f(x) = x^3 - 3x on [-2, 2.5] ===')
print(f'{"Point":>8} {"f(x)":>10} {"Type":>15}')
f_vals = [(x, f(x)) for x in candidates]
min_x = min(f_vals, key=lambda p: p[1])
max_x = max(f_vals, key=lambda p: p[1])
for pt, val in f_vals:
kind = 'Global MIN' if pt == min_x[0] else ('Global MAX' if pt == max_x[0] else 'Candidate')
print(f'{pt:8.3f} {val:10.4f} {kind:>15}')
# --- MVT illustration ---
print('\n=== Mean Value Theorem ===')
x_mvt = np.linspace(a, b, 1000)
avg_slope = (f(b) - f(a)) / (b - a)
print(f'Average slope [f({b})-f({a})] / ({b}-{a}) = {avg_slope:.4f}')
# Find c where f'(c) = avg_slope: 3c^2 - 3 = avg_slope => c = sqrt((avg_slope+3)/3)
c_mvt = np.sqrt((avg_slope + 3) / 3)
print(f'MVT guarantees c in ({a},{b}) where f\'(c) = {avg_slope:.4f}')
print(f'Found c = {c_mvt:.4f}, f\'({c_mvt:.4f}) = {fp(c_mvt):.4f}')
print(f'PASS — MVT c in interval: {a < c_mvt < b}')
8. Activation Function Derivatives
Activation functions and their derivatives control gradient flow in neural networks. The vanishing gradient problem arises from small derivative values compounding multiplicatively through layers.
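The identities verified numerically below:

$$
\sigma'(x) = \sigma(x)\bigl(1 - \sigma(x)\bigr) \le \tfrac{1}{4},
\qquad
\tanh'(x) = 1 - \tanh^2(x),
\qquad
\mathrm{ReLU}'(x) = \mathbf{1}[x > 0].
$$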
Code cell 20
# === 8.1-8.5 Activation Derivatives ===
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
x = np.linspace(-5, 5, 500)
# Sigmoid
def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_prime(x): s = sigmoid(x); return s * (1 - s)
# Tanh
def tanh_prime(x): return 1 - np.tanh(x)**2
# ReLU
def relu(x): return np.maximum(0, x)
def relu_prime(x): return np.where(np.asarray(x) > 0, 1.0, 0.0)
# GELU
from scipy.special import erf
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def gelu_prime(x):
Phi = 0.5 * (1 + erf(x / np.sqrt(2)))
phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
return Phi + x * phi
# Numerical verification of derivatives at x=1.5
x0 = 1.5
h = 1e-6
def num_deriv(f, x): return (f(x+h) - f(x-h)) / (2*h)
print('=== Activation Derivative Verification at x=1.5 ===')
tests = [
('Sigmoid', sigmoid, sigmoid_prime),
('Tanh', np.tanh, tanh_prime),
('ReLU', relu, relu_prime),
('GELU', gelu, gelu_prime),
]
for name, fn, fn_prime in tests:
a = fn_prime(x0)
n = num_deriv(fn, x0)
ok = abs(a - n) < 1e-5
print(f" {'PASS' if ok else 'FAIL'} {name}: analytic={a:.6f}, numerical={n:.6f}")
# Vanishing gradient analysis
print('\n=== Vanishing Gradient — Sigmoid Product Through L Layers ===')
z_typical = 2.0 # typical saturated input
sig_prime_val = sigmoid_prime(z_typical)
print(f'sigma\'({z_typical}) = {sig_prime_val:.6f}')
for L in [5, 10, 20, 50]:
product = sig_prime_val**L
print(f' {L} layers: (sigma\')^{L} = {product:.2e}')
Code cell 21
# === 8.5 Activation Derivatives — Visualization ===
try:
import matplotlib.pyplot as plt
from scipy.special import erf
HAS_MPL = True
except ImportError:
HAS_MPL = False
import numpy as np
def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_prime(x): s = sigmoid(x); return s * (1 - s)
def relu_prime(x): return np.where(np.asarray(x) > 0, 1.0, 0.0)
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def gelu_prime(x):
Phi = 0.5 * (1 + erf(x / np.sqrt(2)))
phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
return Phi + x * phi
x = np.linspace(-4, 4, 400)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: activation functions
ax = axes[0]
ax.plot(x, sigmoid(x), label='Sigmoid', lw=2)
ax.plot(x, np.tanh(x), label='Tanh', lw=2)
ax.plot(x, np.maximum(0, x), label='ReLU', lw=2)
ax.plot(x, gelu(x), label='GELU', lw=2, ls='--')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Activation Functions')
ax.legend(); ax.set_ylim(-1.5, 4)
# Right: derivatives
ax2 = axes[1]
ax2.plot(x, sigmoid_prime(x), label="Sigmoid'", lw=2)
ax2.plot(x, 1 - np.tanh(x)**2, label="Tanh'", lw=2)
ax2.plot(x, relu_prime(x), label="ReLU'", lw=2)
ax2.plot(x, gelu_prime(x), label="GELU'", lw=2, ls='--')
ax2.axhline(0.25, color='gray', ls=':', lw=1, label='1/4 (sigmoid max)')
ax2.set_xlabel('x'); ax2.set_ylabel("f'(x)")
ax2.set_title('Activation Derivatives')
ax2.legend(); ax2.set_ylim(-0.2, 1.2)
plt.tight_layout(); plt.show()
print('Activation derivatives plotted.')
print('Peak gradient — Sigmoid:', f"{max(sigmoid_prime(x)):.4f} at x=0")
print('Peak gradient — Tanh: ', f"{max(1-np.tanh(x)**2):.4f} at x=0")
print('Constant grad — ReLU: 1.0 for x>0 (no saturation)')
9. Numerical Differentiation
Finite differences approximate derivatives numerically. Two errors compete: truncation error (decreases with $h$) and floating-point round-off (increases as $h \to 0$). The centered difference $\frac{f(x+h) - f(x-h)}{2h}$ is strongly preferred over the forward difference $\frac{f(x+h) - f(x)}{h}$.
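Taylor expansion makes the truncation orders explicit (these are the slopes visible in the log-log error plot below):

$$
\frac{f(x+h) - f(x)}{h} = f'(x) + \frac{h}{2} f''(x) + O(h^2),
\qquad
\frac{f(x+h) - f(x-h)}{2h} = f'(x) + \frac{h^2}{6} f'''(x) + O(h^4).
$$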
Code cell 23
# === 9.1-9.2 Finite Differences — Error Analysis ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def f(x): return np.sin(x)
def f_prime(x): return np.cos(x)
x0 = 1.0
true_deriv = f_prime(x0)
h_vals = np.logspace(-16, 0, 160)
forward_errors = []
centered_errors = []
for h in h_vals:
fwd = (f(x0+h) - f(x0)) / h
ctr = (f(x0+h) - f(x0-h)) / (2*h)
forward_errors.append(abs(fwd - true_deriv))
centered_errors.append(abs(ctr - true_deriv))
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 6))
ax.loglog(h_vals, forward_errors, 'b-', lw=2, label='Forward diff O(h)')
ax.loglog(h_vals, centered_errors, 'r-', lw=2, label='Centered diff O(h²)')
# Reference slopes
h_ref = np.array([1e-8, 1e-4])
ax.loglog(h_ref, h_ref, 'b--', alpha=0.5, label='O(h) slope')
ax.loglog(h_ref, h_ref**2 * 1e4, 'r--', alpha=0.5, label='O(h²) slope')
ax.set_xlabel('Step size h')
ax.set_ylabel('|Error|')
ax.set_title('Finite Difference Error vs Step Size (sin(x) at x=1)')
ax.legend()
plt.tight_layout(); plt.show()
# Find optimal h
opt_fwd_idx = np.argmin(forward_errors)
opt_ctr_idx = np.argmin(centered_errors)
print(f'Optimal h for forward diff: {h_vals[opt_fwd_idx]:.2e} (error: {forward_errors[opt_fwd_idx]:.2e})')
print(f'Optimal h for centered diff: {h_vals[opt_ctr_idx]:.2e} (error: {centered_errors[opt_ctr_idx]:.2e})')
print(f'Theoretical optimal centered: ~6e-6 (eps_mach^(1/3))')
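The printed heuristic comes from balancing the $O(h^2)$ truncation error against the $O(\varepsilon/h)$ round-off error; a one-liner confirms the machine-epsilon estimate (a minimal check, not part of the original cell):

```python
import numpy as np

eps = np.finfo(float).eps                    # machine epsilon, ~2.22e-16 for float64
print(f"eps_mach^(1/3) = {eps**(1/3):.2e}")  # ~6.06e-06, matching the empirical optimum
```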
Code cell 24
# === 9.3 Gradient Checking — Verifying Analytical Gradients ===
import numpy as np
def grad_check_scalar(f, x, analytic_grad, h=1e-5):
"""Check gradient of scalar-to-scalar function f."""
numerical = (f(x+h) - f(x-h)) / (2*h)
rel_error = abs(analytic_grad - numerical) / (abs(analytic_grad) + abs(numerical) + 1e-10)
return rel_error, numerical
def grad_check_vector(f, x, analytic_grad, h=1e-5):
"""Check gradient of vector-to-scalar function f."""
n = len(x)
numerical_grad = np.zeros(n)
for i in range(n):
x_plus = x.copy(); x_plus[i] += h
x_minus = x.copy(); x_minus[i] -= h
numerical_grad[i] = (f(x_plus) - f(x_minus)) / (2*h)
rel_error = np.linalg.norm(analytic_grad - numerical_grad) / \
(np.linalg.norm(analytic_grad) + np.linalg.norm(numerical_grad) + 1e-10)
return rel_error, numerical_grad
print('=== Gradient Checking Examples ===')
# Example 1: f(x) = x*ln(x+e^(-x))
def f1(x): return x * np.log(x + np.exp(-x))
def f1_prime(x): return np.log(x + np.exp(-x)) + x * (1 - np.exp(-x)) / (x + np.exp(-x))
x_test = 1.5
err1, num1 = grad_check_scalar(f1, x_test, f1_prime(x_test))
print(f'f(x) = x*ln(x+e^-x): analytic={f1_prime(x_test):.6f}, numerical={num1:.6f}')
print(f' Relative error: {err1:.2e} → {"PASS" if err1 < 1e-5 else "FAIL"}')
# Example 2: L2 loss gradient
def loss(w):
x = np.array([1.0, 2.0, -1.0])
y = 1.5
return 0.5 * (np.dot(w, x) - y)**2
def loss_grad(w):
x = np.array([1.0, 2.0, -1.0])
y = 1.5
return (np.dot(w, x) - y) * x
w = np.array([0.5, -0.3, 1.2])
err2, num_grad = grad_check_vector(loss, w, loss_grad(w))
print(f'\nL2 loss gradient:')
print(f' Analytic: {loss_grad(w)}')
print(f' Numerical: {num_grad}')
print(f' Relative error: {err2:.2e} → {"PASS" if err2 < 1e-5 else "FAIL"}')
10. Key Proofs — Interactive Verification
We verify the main derivative proofs computationally, checking that the theoretical formulas match numerical estimates to machine precision.
Code cell 26
# === 10. Proof Verification ===
import numpy as np
def centered(f, x, h=1e-7):
return (f(x+h) - f(x-h)) / (2*h)
print('=== Verifying All Main Derivative Rules ===')
tests = [
# (name, analytic_fn, numerical_fn, x)
('Power x^7', lambda x: 7*x**6, lambda x: centered(lambda t: t**7, x), 1.5),
('sin(x)', np.cos, lambda x: centered(np.sin, x), 1.2),
('cos(x)', lambda x: -np.sin(x), lambda x: centered(np.cos, x), 0.7),
('e^x', np.exp, lambda x: centered(np.exp, x), 2.0),
('ln(x)', lambda x: 1/x, lambda x: centered(np.log, x), 3.0),
('arctan(x)', lambda x: 1/(1+x**2), lambda x: centered(np.arctan, x), 1.0),
('arcsin(x)', lambda x: 1/np.sqrt(1-x**2), lambda x: centered(np.arcsin, x), 0.5),
('x^x (log diff)', lambda x: x**x*(np.log(x)+1), lambda x: centered(lambda t: t**t, x), 2.0),
('x^2*e^x (product)', lambda x: x**2*np.exp(x) + 2*x*np.exp(x),
lambda x: centered(lambda t: t**2*np.exp(t), x), 1.5),
('sin(e^x) (chain)', lambda x: np.cos(np.exp(x))*np.exp(x),
lambda x: centered(lambda t: np.sin(np.exp(t)), x), 0.5),
('Softplus derivative=sigma', lambda x: 1/(1+np.exp(-x)),
lambda x: centered(lambda t: np.log(1+np.exp(t)), x), 1.0),
]
all_pass = True
for name, analytic, numerical, x in tests:
a = analytic(x); n = numerical(x)
err = abs(a - n) / (abs(a) + 1e-10)
ok = err < 1e-5
all_pass = all_pass and ok
print(f" {'PASS' if ok else 'FAIL'} {name}: err={err:.2e}")
print(f'\nAll tests: {"PASS" if all_pass else "SOME FAILED"}')
11. Application — Adam Optimizer Analysis
The Adam optimizer maintains exponential moving averages of the gradient (first moment) and the squared gradient (second moment) to adapt per-parameter learning rates. We simulate Adam on a perturbed quadratic and compare its convergence against vanilla SGD.
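The update implemented in `run_adam` below, with bias-corrected moment estimates:

$$
m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t, \quad
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2, \quad
\hat m_t = \frac{m_t}{1 - \beta_1^t}, \quad
\hat v_t = \frac{v_t}{1 - \beta_2^t}, \quad
\theta_{t+1} = \theta_t - \eta\, \frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}.
$$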
Code cell 28
# === 11. Adam Optimizer — Single Variable Simulation ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
# Loss: f(theta) = (theta - 3)^2 + 0.1*sin(5*theta)
def loss(theta): return (theta - 3)**2 + 0.1*np.sin(5*theta)
def grad(theta): return 2*(theta - 3) + 0.5*np.cos(5*theta)
def run_sgd(theta0, lr=0.1, steps=200):
theta = theta0; trajectory = [theta0]
for _ in range(steps):
theta -= lr * grad(theta)
trajectory.append(theta)
return np.array(trajectory)
def run_adam(theta0, lr=0.1, b1=0.9, b2=0.999, eps=1e-8, steps=200):
theta = theta0; m = 0.0; v = 0.0
trajectory = [theta0]
for t in range(1, steps+1):
g = grad(theta)
m = b1*m + (1-b1)*g
v = b2*v + (1-b2)*g**2
m_hat = m / (1 - b1**t)
v_hat = v / (1 - b2**t)
theta -= lr * m_hat / (np.sqrt(v_hat) + eps)
trajectory.append(theta)
return np.array(trajectory)
theta0 = -2.0
sgd_traj = run_sgd(theta0, lr=0.1)
adam_traj = run_adam(theta0, lr=0.5)
print('=== Optimizer Comparison ===')
print(f'True minimum near theta=3 (ignoring sin perturbation)')
print(f'SGD final theta: {sgd_traj[-1]:.6f}, loss: {loss(sgd_traj[-1]):.6f}')
print(f'Adam final theta: {adam_traj[-1]:.6f}, loss: {loss(adam_traj[-1]):.6f}')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
steps = np.arange(len(sgd_traj))
axes[0].plot(steps, loss(sgd_traj), 'b-', lw=2, label='SGD')
axes[0].plot(steps, loss(adam_traj), 'r-', lw=2, label='Adam')
axes[0].set_xlabel('Step'); axes[0].set_ylabel('Loss')
axes[0].set_title('Loss vs Training Step')
axes[0].legend(); axes[0].set_yscale('log')
theta_plot = np.linspace(-3, 5, 300)
axes[1].plot(theta_plot, loss(theta_plot), 'k-', lw=2, label='Loss surface')
axes[1].plot(sgd_traj[:50], loss(sgd_traj[:50]), 'b.-', ms=4, alpha=0.7, label='SGD path')
axes[1].plot(adam_traj[:50], loss(adam_traj[:50]), 'r.-', ms=4, alpha=0.7, label='Adam path')
axes[1].set_xlabel('theta'); axes[1].set_ylabel('Loss')
axes[1].set_title('Optimizer Trajectories on Loss Surface (first 50 steps)')
axes[1].legend()
plt.tight_layout(); plt.show()
12. Backpropagation — Chain Rule on Computation Graphs
We implement a complete 2-layer network forward/backward pass, verify gradients against numerical estimates, and visualize gradient flow through layers.
Code cell 30
# === 12. Full 2-Layer Network Backprop ===
import numpy as np
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)
class TwoLayerNet:
    def __init__(self, w1, b1, w2, b2):
        self.w1 = w1; self.b1 = b1
        self.w2 = w2; self.b2 = b2
    def forward(self, x, y):
        self.x = x; self.y = y  # cache inputs for the backward pass
        self.z1 = self.w1*x + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = self.w2*self.a1 + self.b2
        self.a2 = sigmoid(self.z2)
        self.loss = 0.5*(self.a2 - y)**2
        return self.loss
    def backward(self):
        # Layer 2: dL/dz2 = (a2 - y) * sigma'(z2)
        dL_dz2 = (self.a2 - self.y) * sigmoid_prime(self.z2)
        dL_dw2 = dL_dz2 * self.a1
        dL_db2 = dL_dz2
        # Layer 1: chain through w2, then sigma'(z1)
        dL_dz1 = dL_dz2 * self.w2 * sigmoid_prime(self.z1)
        dL_dw1 = dL_dz1 * self.x
        dL_db1 = dL_dz1
        return dict(w1=dL_dw1, b1=dL_db1, w2=dL_dw2, b2=dL_db2)
# Manual backprop
def backprop(x, y, w1, b1, w2, b2):
# Forward
z1 = w1*x + b1
a1 = sigmoid(z1)
z2 = w2*a1 + b2
a2 = sigmoid(z2)
L = 0.5*(a2 - y)**2
# Backward
dL_da2 = a2 - y
da2_dz2 = sigmoid_prime(z2)
dL_dz2 = dL_da2 * da2_dz2
dL_dw2 = dL_dz2 * a1
dL_db2 = dL_dz2
dL_da1 = dL_dz2 * w2
da1_dz1 = sigmoid_prime(z1)
dL_dz1 = dL_da1 * da1_dz1
dL_dw1 = dL_dz1 * x
dL_db1 = dL_dz1
return L, dict(w1=dL_dw1, b1=dL_db1, w2=dL_dw2, b2=dL_db2)
# Test values
x, y, w1, b1, w2, b2 = 1.5, 0.7, 2.0, -0.5, 1.2, 0.3
L, grads = backprop(x, y, w1, b1, w2, b2)
print('=== 2-Layer Network Gradient Check ===')
print(f'Loss: {L:.6f}')
# Numerical check via parameter perturbation
h = 1e-5
for param_name, param_val, idx in [('w1',w1,0),('b1',b1,1),('w2',w2,2),('b2',b2,3)]:
params = [w1, b1, w2, b2]
p_plus = params[:]; p_plus[idx] += h
p_minus = params[:]; p_minus[idx] -= h
L_p = backprop(x, y, *p_plus)[0]
L_m = backprop(x, y, *p_minus)[0]
numerical = (L_p - L_m) / (2*h)
analytic = grads[param_name]
ok = abs(analytic - numerical) < 1e-6
print(f" {'PASS' if ok else 'FAIL'} d{param_name}: analytic={analytic:.8f}, "
f"numerical={numerical:.8f}")
13. Softmax and Cross-Entropy Gradient
The gradient of cross-entropy loss with respect to softmax logits has a beautifully simple form: $\partial L / \partial z_i = p_i - y_i$.
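A one-line derivation for one-hot $y$ with true class $c$:

$$
L = -\log p_c = -z_c + \log \sum_j e^{z_j}
\;\Longrightarrow\;
\frac{\partial L}{\partial z_i} = \frac{e^{z_i}}{\sum_j e^{z_j}} - \mathbf{1}[i = c] = p_i - y_i.
$$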
Code cell 32
# === 13. Softmax + Cross-Entropy Gradient ===
import numpy as np
def softmax(z):
z_stable = z - np.max(z) # numerical stability
e = np.exp(z_stable)
return e / e.sum()
def cross_entropy(p, y):
return -np.sum(y * np.log(p + 1e-12))
def softmax_ce_grad(z, y):
"""Gradient of CE loss w.r.t. logits z — should equal p - y."""
p = softmax(z)
return p - y
# Test
K = 5
np.random.seed(42)
z = np.random.randn(K)
y = np.zeros(K); y[2] = 1.0 # true class = 2
p = softmax(z)
L = cross_entropy(p, y)
analytic_grad = softmax_ce_grad(z, y)
# Numerical gradient
h = 1e-5
numerical_grad = np.zeros(K)
for i in range(K):
z_p = z.copy(); z_p[i] += h
z_m = z.copy(); z_m[i] -= h
numerical_grad[i] = (cross_entropy(softmax(z_p), y) - cross_entropy(softmax(z_m), y)) / (2*h)
print('=== Softmax + Cross-Entropy Gradient ===')
print(f'Logits z: {z}')
print(f'Probs p: {p}')
print(f'Loss L: {L:.6f}')
print(f'\nGradient comparison (should be p - y):')
print(f' p - y: {p - y}')
print(f' Analytic: {analytic_grad}')
print(f' Numerical: {numerical_grad}')
err = np.linalg.norm(analytic_grad - numerical_grad)
print(f'\nPASS - gradient error: {err:.2e}' if err < 1e-5 else f'FAIL - error: {err:.2e}')
14. Vanishing Gradients — Quantitative Analysis
We simulate gradient propagation through sigmoid layers and observe exponential decay. This motivates ReLU and residual connections in deep networks.
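The quantity being simulated: ignoring weight factors (as the code does), the gradient reaching layer 1 contains a product of local sigmoid derivatives, so

$$
\left|\frac{\partial L}{\partial a_1}\right| \propto \prod_{\ell=1}^{L} \bigl|\sigma'(z_\ell)\bigr| \le \left(\tfrac{1}{4}\right)^{\!L},
$$

which decays exponentially in the depth $L$.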
Code cell 34
# === 14. Vanishing Gradient Simulation ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)
def relu_prime(x): return float(x > 0)
np.random.seed(42)
max_layers = 50
# Simulate gradient magnitude at layer 1 as function of depth L
sigmoid_grads = []
relu_grads = []
for L in range(1, max_layers+1):
# Random pre-activations (sigmoid tends to saturate)
z_sigmoid = np.random.randn(L) * 2 # some saturation
z_relu = np.abs(np.random.randn(L)) # mostly positive
grad_sig = np.prod([sigmoid_prime(z) for z in z_sigmoid])
grad_relu = np.prod([relu_prime(z) for z in z_relu])
sigmoid_grads.append(abs(grad_sig))
relu_grads.append(abs(grad_relu))
layers = np.arange(1, max_layers+1)
print('=== Gradient Magnitude at Layer 1 vs Network Depth ===')
for L in [1, 5, 10, 20, 50]:
print(f' L={L:2d}: sigmoid={sigmoid_grads[L-1]:.2e}, relu={relu_grads[L-1]:.2e}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 6))
ax.semilogy(layers, sigmoid_grads, 'b-', lw=2, label='Sigmoid activations')
ax.semilogy(layers, relu_grads, 'r-', lw=2, label='ReLU activations')
ax.axhline(1e-7, color='gray', ls=':', label='Float32 precision floor')
ax.set_xlabel('Number of layers L')
ax.set_ylabel('|Gradient at layer 1|')
ax.set_title('Vanishing Gradient: Sigmoid vs ReLU')
ax.legend()
plt.tight_layout(); plt.show()
print('\nConclusion: sigmoid gradients vanish exponentially;')
print('ReLU maintains O(1) gradient for active units.')
15. Numerically Stable Derivative Computations
Floating-point arithmetic introduces subtle errors. We compare naive vs. stable implementations for sigmoid, log-sigmoid, and softmax.
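The identity behind the stable log-sigmoid in the cell below:

$$
\log \sigma(x) = -\log\bigl(1 + e^{-x}\bigr) = -\operatorname{logaddexp}(0, -x),
$$

which `np.logaddexp` evaluates without overflow for any $x$.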
Code cell 36
# === 15. Numerical Stability ===
import numpy as np
# Naive sigmoid — overflows for large negative x
def sigmoid_naive(x): return 1 / (1 + np.exp(-x))
# Stable sigmoid
def sigmoid_stable(x):
return np.where(x >= 0,
1 / (1 + np.exp(-x)),
np.exp(x) / (1 + np.exp(x)))
# Naive log-sigmoid
def log_sigmoid_naive(x): return np.log(sigmoid_naive(x))
# Stable log-sigmoid
def log_sigmoid_stable(x): return -np.logaddexp(0, -x)
test_vals = [-1000, -100, -10, 0, 10, 100, 1000]
print('=== Sigmoid Stability ===')
print(f'{"x":>8} {"Naive":>15} {"Stable":>15} {"True":>15}')
for x in test_vals:
naive = sigmoid_naive(np.float64(x))
stable = sigmoid_stable(np.float64(x))
true_val = 1/(1+np.exp(-np.float64(x))) if abs(x) < 700 else (1.0 if x > 0 else 0.0)
print(f'{x:8.0f} {naive:15.8f} {stable:15.8f} {true_val:15.8f}')
print('\n=== Log-Sigmoid Stability ===')
print(f'{"x":>8} {"Naive":>15} {"Stable":>15}')
for x in [-100, -10, 0, 10, 100]:
with np.errstate(divide='ignore', invalid='ignore'):
naive = log_sigmoid_naive(np.float64(x))
stable = log_sigmoid_stable(np.float64(x))
print(f'{x:8.0f} {naive:15.8f} {stable:15.8f}')
print('\nNaive gives -inf or 0 for large |x|; stable is correct throughout.')
16. Logarithmic Differentiation
When the function is a product/quotient of many terms, taking $\ln$ first and differentiating implicitly, via $f' = f \cdot (\ln f)'$, is often much simpler.
Code cell 38
# === 16. Logarithmic Differentiation ===
import numpy as np
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)
# f(x) = x^x
# ln f = x ln x => f'/f = ln x + 1 => f' = x^x * (ln x + 1)
def f_xx(x): return x**x
def f_xx_prime(x): return x**x * (np.log(x) + 1)
x = 2.0
print(f'x^x derivative at x=2:')
print(f' Analytic: {f_xx_prime(x):.8f}')
print(f' Numerical: {centered(f_xx, x):.8f}')
print(f' PASS: {abs(f_xx_prime(x) - centered(f_xx, x)) < 1e-5}')
# f(x) = (sin x)^x for x > 0
# ln f = x ln(sin x) => f'/f = ln(sin x) + x*cos(x)/sin(x)
def f_sinx_x(x): return np.sin(x)**x
def f_sinx_x_prime(x):
return np.sin(x)**x * (np.log(np.sin(x)) + x*np.cos(x)/np.sin(x))
x = 1.5
print(f'\n(sin x)^x derivative at x=1.5:')
print(f' Analytic: {f_sinx_x_prime(x):.8f}')
print(f' Numerical: {centered(f_sinx_x, x):.8f}')
print(f' PASS: {abs(f_sinx_x_prime(x) - centered(f_sinx_x, x)) < 1e-5}')
# Demonstrate for multi-factor expression
# g(x) = (x^3 * sqrt(x+1)) / (x^2+2)^4
def g(x): return (x**3 * np.sqrt(x+1)) / (x**2+2)**4
def g_prime(x):
# ln g = 3 ln x + 0.5 ln(x+1) - 4 ln(x^2+2)
# g'/g = 3/x + 1/(2(x+1)) - 8x/(x^2+2)
return g(x) * (3/x + 1/(2*(x+1)) - 8*x/(x**2+2))
x = 1.5
print(f'\nMulti-factor g(x) at x=1.5:')
print(f' Analytic: {g_prime(x):.8f}')
print(f' Numerical: {centered(g, x):.8f}')
print(f' PASS: {abs(g_prime(x) - centered(g, x)) < 1e-5}')
17. Inverse Function Theorem
If $f$ is differentiable at $x$ with $f'(x) \neq 0$, then $f^{-1}$ is differentiable at $y = f(x)$ with $(f^{-1})'(y) = 1 / f'(x)$. We use this to derive all inverse trig derivatives.
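For example, applying the theorem to $y = \arcsin x$ (one of the checks below):

$$
\sin y = x \;\Longrightarrow\; (\arcsin)'(x) = \frac{1}{\cos y} = \frac{1}{\sqrt{1 - \sin^2 y}} = \frac{1}{\sqrt{1 - x^2}}.
$$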
Code cell 40
# === 17. Inverse Function Theorem ===
import numpy as np
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)
# arcsin: (arcsin x)' = 1/sqrt(1-x^2)
def arcsin_prime(x): return 1/np.sqrt(1-x**2)
# arctan: (arctan x)' = 1/(1+x^2)
def arctan_prime(x): return 1/(1+x**2)
test_pts = [0.0, 0.3, -0.5]
print('=== Inverse Trig Derivatives ===')
for x in test_pts:
a_asin = arcsin_prime(x); n_asin = centered(np.arcsin, x)
a_atan = arctan_prime(x); n_atan = centered(np.arctan, x)
print(f'x={x:+.1f}: arcsin\' analytic={a_asin:.6f}, numerical={n_asin:.6f} '
f'| arctan\' analytic={a_atan:.6f}, numerical={n_atan:.6f}')
print('\nAll PASS:', all(
abs(arcsin_prime(x) - centered(np.arcsin, x)) < 1e-5 and
abs(arctan_prime(x) - centered(np.arctan, x)) < 1e-5
for x in test_pts
))
# IFT: verify (f^-1)'(f(x)) = 1/f'(x) for f = exp
print('\nIFT check: (ln)\' at y = e^x = f(x), should equal 1/f\'(x) = 1/e^x')
for x in [0.5, 1.0, 2.0]:
y = np.exp(x)
inv_prime_at_y = 1/y # (ln)' at y
ift_formula = 1/np.exp(x) # 1/f'(x)
print(f' x={x}: (ln)\'(e^x)={inv_prime_at_y:.6f}, 1/e^x={ift_formula:.6f}')
18. Mean Value Theorem — Visualization
We visualize the MVT: for $f$ continuous on $[a, b]$ and differentiable on $(a, b)$, there exists $c \in (a, b)$ where the tangent is parallel to the secant. Applications: monotonicity tests, Lipschitz bounds.
Code cell 42
# === 18. MVT Visualization ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def f(x): return np.sin(x) + 0.3*x
def fp(x): return np.cos(x) + 0.3
a, b = 0.5, 4.0
secant_slope = (f(b) - f(a)) / (b - a)
# Find c where f'(c) = secant_slope numerically
x_scan = np.linspace(a, b, 10000)
fp_vals = fp(x_scan)
# c where fp closest to secant_slope
c_idx = np.argmin(abs(fp_vals - secant_slope))
c_mvt = x_scan[c_idx]
print(f'f(x) = sin(x) + 0.3x on [{a}, {b}]')
print(f'Secant slope: {secant_slope:.4f}')
print(f'MVT point c = {c_mvt:.4f}, f\'(c) = {fp(c_mvt):.4f}')
if HAS_MPL:
x_plot = np.linspace(0, 4.5, 300)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_plot, f(x_plot), 'b-', lw=2, label='f(x)')
# Secant line
x_sec = np.linspace(a-0.2, b+0.2, 50)
y_sec = f(a) + secant_slope*(x_sec - a)
ax.plot(x_sec, y_sec, 'r-', lw=2, label=f'Secant (slope={secant_slope:.2f})')
# Tangent at c
x_tang = np.linspace(c_mvt-0.8, c_mvt+0.8, 50)
y_tang = f(c_mvt) + fp(c_mvt)*(x_tang - c_mvt)
ax.plot(x_tang, y_tang, 'g--', lw=2, label=f'Tangent at c={c_mvt:.2f}')
ax.plot([a, b], [f(a), f(b)], 'ro', ms=8)
ax.plot(c_mvt, f(c_mvt), 'g^', ms=10, label='MVT point c')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Mean Value Theorem')
ax.legend()
plt.tight_layout(); plt.show()
19. Derivatives and Taylor Approximation
The first-order Taylor approximation $f(x) \approx f(a) + f'(a)(x - a)$ is the derivative's most direct practical application. We verify approximation quality across multiple functions.
Code cell 44
# === 19. Linear (First-Order Taylor) Approximation ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except:
HAS_MPL = False
def linear_approx(f, fp, a, x):
return f(a) + fp(a)*(x - a)
functions = [
('sin(x)', np.sin, np.cos, np.pi/4),
('e^x', np.exp, np.exp, 0.0),
('ln(x)', np.log, lambda x: 1/x, 1.0),
('sqrt(x)', np.sqrt, lambda x: 1/(2*np.sqrt(x)), 4.0),
]
print('=== Linear Approximation Quality ===')
print(f'{"Function":>12} {"a":>6} {"x-a":>6} {"True f(x)":>12} {"Linear":>12} {"Error":>10}')
for name, f, fp, a in functions:
for dx in [0.1, 0.5, 1.0]:
x = a + dx
true_val = f(x)
lin_val = linear_approx(f, fp, a, x)
error = abs(true_val - lin_val)
print(f'{name:>12} {a:6.2f} {dx:6.2f} {true_val:12.6f} {lin_val:12.6f} {error:10.2e}')
print('\nPattern: error grows as O((x-a)^2) — the quadratic (second-order) term.')
print('The full story is in §04-Series-and-Sequences (Taylor series).')
20. Subgradients and Non-Smooth Functions
For non-differentiable convex functions (ReLU, |x|, L1 norm), subgradients generalize the derivative. Subgradient descent is used for L1 regularization.
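Formally, $g$ is a subgradient of a convex $f$ at $x_0$ when its supporting line stays below the graph everywhere:

$$
f(x) \ge f(x_0) + g\,(x - x_0) \quad \forall x,
\qquad\text{e.g.}\qquad
\partial |x| \big|_{x=0} = [-1, 1].
$$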
Code cell 46
# === 20. Subgradients ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
# Subgradient of |x| at x=0 is any value in [-1,1]
# For optimization: use sign(x), subgradient 0 at x=0 is standard choice
x = np.linspace(-2, 2, 400)
f_abs = np.abs(x)
f_relu = np.maximum(0, x)
# Supporting hyperplanes at x=0 for |x|
x_supp = np.linspace(-1.5, 1.5, 50)
subgrads_to_show = [-1.0, -0.5, 0.0, 0.5, 1.0]
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
ax = axes[0]
ax.plot(x, f_abs, 'b-', lw=3, label='|x|', zorder=3)
colors = plt.cm.plasma(np.linspace(0.1, 0.9, len(subgrads_to_show)))
for g, c in zip(subgrads_to_show, colors):
y_supp = g * x_supp # supporting line: f(0) + g*(x-0) = g*x
ax.plot(x_supp, y_supp, '--', color=c, alpha=0.7, lw=1.5,
label=f'g={g:+.1f}')
ax.set_xlim(-1.5, 1.5); ax.set_ylim(-0.2, 1.5)
ax.set_xlabel('x'); ax.set_title('Subgradients of |x| at x=0')
ax.legend(fontsize=9)
ax2 = axes[1]
ax2.plot(x, f_relu, 'r-', lw=3, label='ReLU(x)')
for g, c in zip([0.0, 0.5, 1.0], ['blue','green','orange']):
y_supp = g * x_supp
ax2.plot(x_supp, y_supp, '--', color=c, alpha=0.7, label=f'g={g:.1f}')
ax2.set_xlim(-1.5, 1.5); ax2.set_ylim(-0.3, 1.5)
ax2.set_xlabel('x'); ax2.set_title('Subgradients of ReLU at x=0')
ax2.legend()
plt.tight_layout(); plt.show()
# Soft-thresholding (proximal operator of lambda*|x|)
lam = 0.3
def soft_threshold(v, lam):
return np.sign(v) * np.maximum(np.abs(v) - lam, 0)
v_test = np.array([-1.0, -0.2, 0.0, 0.15, 0.5, 1.0])
print('=== Soft-Thresholding (Proximal Operator of L1) ===')
print(f'lambda = {lam}')
for v in v_test:
print(f' v={v:+.2f} -> ST={soft_threshold(v, lam):+.2f}')
21. Modern Activations — SiLU, Swish, Mish
Post-ReLU activations used in modern transformers. SiLU/Swish $= x\,\sigma(x)$; Mish $= x \tanh(\operatorname{softplus}(x))$.
Code cell 48
# === 21. Modern Activations ===
import numpy as np
try:
import matplotlib.pyplot as plt
from scipy.special import erf
HAS_MPL = True
except ImportError:
HAS_MPL = False
def sigmoid(x): return 1/(1+np.exp(-np.clip(x,-500,500)))
# SiLU / Swish
def swish(x): return x * sigmoid(x)
def swish_prime(x):
s = sigmoid(x)
return s + x * s * (1 - s)
# Mish
def mish(x): return x * np.tanh(np.log(1+np.exp(np.clip(x,-500,500))))
def mish_prime(x):
sp = np.log(1+np.exp(np.clip(x,-500,500))) # softplus(x)
ts = np.tanh(sp)
sig = sigmoid(x)
return ts + x * (1 - ts**2) * sig
# GELU for comparison
def gelu(x): return x * 0.5 * (1 + erf(x/np.sqrt(2)))
def gelu_prime(x):
Phi = 0.5 * (1 + erf(x/np.sqrt(2)))
phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
return Phi + x*phi
# Verify derivatives
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)
x = 1.5
print('=== Modern Activation Derivative Verification at x=1.5 ===')
for name, fn, fn_prime in [('Swish', swish, swish_prime), ('Mish', mish, mish_prime),
('GELU', gelu, gelu_prime)]:
a = fn_prime(x); n = centered(fn, x)
print(f" {'PASS' if abs(a-n)<1e-5 else 'FAIL'} {name}: analytic={a:.6f}, numerical={n:.6f}")
x_plot = np.linspace(-3, 3, 400)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for fn, nm, c in [(swish,'Swish','b'), (mish,'Mish','r'), (gelu,'GELU','g'),
(lambda x: np.maximum(0,x), 'ReLU', 'k')]:
axes[0].plot(x_plot, fn(x_plot), lw=2, color=c, label=nm)
axes[0].set_ylim(-0.5, 3); axes[0].set_title('Modern Activations')
axes[0].set_xlabel('x'); axes[0].legend()
for fn, nm, c in [(swish_prime,'Swish\'','b'), (mish_prime,'Mish\'','r'),
(gelu_prime,'GELU\'','g')]:
axes[1].plot(x_plot, fn(x_plot), lw=2, color=c, label=nm)
axes[1].set_ylim(-0.2, 1.2); axes[1].set_title('Activation Derivatives')
axes[1].set_xlabel('x'); axes[1].legend()
plt.tight_layout(); plt.show()
22. Gradient Descent — Convergence Analysis
For $L$-smooth functions, a learning rate $\eta < 2/L$ guarantees descent. We verify the descent lemma numerically.
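On the quadratic used below the analysis is exact: gradient descent is a linear map,

$$
\theta_{t+1} = \theta_t - \eta L \theta_t = (1 - \eta L)\,\theta_t
\;\Longrightarrow\;
\theta_t = (1 - \eta L)^t\, \theta_0,
$$

which converges iff $|1 - \eta L| < 1$, i.e. $0 < \eta < 2/L$.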
Code cell 50
# === 22. Gradient Descent Convergence ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except:
HAS_MPL = False
# Quadratic: f(theta) = (L/2) * theta^2, L-smooth with constant L = 5
L_smooth = 5.0
def f_quad(theta): return (L_smooth/2) * theta**2
def grad_quad(theta): return L_smooth * theta
theta0 = 3.0
steps = 30
print(f'=== Gradient Descent on f(theta) = ({L_smooth}/2)*theta^2 ===')
print(f'L-smooth constant L = {L_smooth}')
print(f'Critical eta: 2/L = {2/L_smooth:.4f}')
learning_rates = [0.1, 0.3, 1/L_smooth, 0.35, 0.4]
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 6))
for eta in learning_rates:
theta = theta0
losses = [f_quad(theta)]
for _ in range(steps):
theta -= eta * grad_quad(theta)
losses.append(f_quad(theta))
converges = losses[-1] < 1e-6
print(f' eta={eta:.2f}: final loss={losses[-1]:.2e} ({"converges" if converges else "DIVERGES"})')
if HAS_MPL:
ax.semilogy(losses, lw=2, label=f'eta={eta:.2f}')
if HAS_MPL:
ax.axhline(1e-6, color='gray', ls=':', label='Convergence threshold')
ax.set_xlabel('Step'); ax.set_ylabel('Loss (log scale)')
ax.set_title(f'GD on Quadratic (L={L_smooth}) — Effect of Learning Rate')
ax.legend()
plt.tight_layout(); plt.show()
print(f'\nConclusion: eta >= 2/L = {2/L_smooth:.2f} fails to converge (oscillates at the threshold, diverges beyond); eta < 2/L converges.')
23. Score Function and Fisher Information
The derivative of the log-likelihood with respect to the parameter is the score. Its variance is the Fisher information, which connects calculus to statistical estimation and second-order optimization.
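The definitions used in the cell below:

$$
s(\theta; x) = \frac{\partial}{\partial \theta} \log p(x; \theta),
\qquad
I(\theta) = \mathbb{E}\bigl[s(\theta; X)^2\bigr] = -\,\mathbb{E}\!\left[\frac{\partial^2}{\partial \theta^2} \log p(X; \theta)\right].
$$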
Code cell 52
# === 23. Score Function and Fisher Information ===
import numpy as np
# Gaussian family: p(x; mu) = (1/sqrt(2pi)) * exp(-(x-mu)^2/2)
# log p = const - (x-mu)^2/2
# score = d/dmu log p = x - mu
# Fisher info: E[(score)^2] = E[(x-mu)^2] = Var[X] = 1 (for standard normal)
np.random.seed(42)
mu_true = 2.0
n_samples = 10000
X = np.random.normal(mu_true, 1.0, n_samples)
# Compute score and Fisher info empirically
def score(x, mu): return x - mu
def log_likelihood(X, mu): return -0.5*np.sum((X - mu)**2) - len(X)*0.5*np.log(2*np.pi)
# MLE: dL/dmu = 0 => sum(x_i - mu) = 0 => mu = xbar
mu_mle = np.mean(X)
# Score at MLE should be ~0 (optimality condition)
score_at_mle = np.mean(score(X, mu_mle))
# Fisher info: E[(x-mu)^2] should be Var[X] = 1
fisher_empirical = np.mean(score(X, mu_true)**2)
print('=== Score Function and Fisher Information ===')
print(f'True mu: {mu_true}, MLE mu_hat: {mu_mle:.4f}')
print(f'Score at MLE (should be ~0): {score_at_mle:.6f}')
print(f'Empirical Fisher info: {fisher_empirical:.4f} (true: 1.0)')
# Fisher-Rao connection: Fisher info = -E[d^2 log p/dmu^2]
# d^2/dmu^2 log p = -1 for Gaussian, so E[-(-1)] = 1
fisher_via_hessian = -np.mean(np.full_like(X, -1.0))  # d^2/dmu^2 log p = -1 for every sample
print(f'Fisher via Hessian: {fisher_via_hessian:.4f} (identity: I = -E[d^2 log p/dmu^2])')
# Cramer-Rao bound: Var[mu_hat] >= 1/I(mu)
print(f'\nCramer-Rao bound: Var[MLE] >= 1/I = {1/fisher_empirical:.4f}')
print(f'Actual Var[X_bar] = 1/n = {1/n_samples:.4f} (achieves bound — MLE is efficient)')
Summary
This notebook covered the full theory of differentiation:
| Section | Key Result |
|---|---|
| §1 Intuition | Secant → tangent line as $h \to 0$ |
| §2 Definition | $f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$ |
| §3 Rules | Power, product, quotient, chain, elementary functions |
| §4 Chain Rule | $(f \circ g)'(x) = f'(g(x))\,g'(x)$ — backbone of backprop |
| §5 Implicit | $dy/dx = -F_x / F_y$; linear approximation $f(a) + f'(a)(x - a)$ |
| §6 Second Derivative | Concavity, inflection, $f''$ test at critical points |
| §7 Extrema / MVT | First/second derivative tests; Lipschitz bounds |
| §8 Activations | $\sigma' = \sigma(1 - \sigma)$, ReLU', GELU'; vanishing gradients |
| §9 Numerical Diff | Forward $O(h)$, centered $O(h^2)$, optimal $h \approx \varepsilon_{\text{mach}}^{1/3}$ |
| §10 Proofs | Product rule, chain rule, differentiable → continuous |
| §11 Adam | First/second moment estimates; adaptive learning rates |
| §12 Backprop | Full 2-layer network chain rule derivation and verification |
| §13 Softmax+CE | $\partial L / \partial z = p - y$ |
| §14 Vanishing Grad | Exponential decay through sigmoid layers; ReLU advantage |
| §15 Stability | Numerically stable sigmoid, log-sigmoid, softmax |
Next: Integration →