Theory Notebook
Converted from theory.ipynb for web reading.
Derivatives and Differentiation
"The derivative is the instantaneous rate of change — and backpropagation is just the chain rule applied to a computation graph."
Interactive theory notebook covering: derivative definition, differentiation rules, chain rule, activation function derivatives, numerical differentiation, and optimization foundations.
Companion: notes.md | exercises.ipynb
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
ok = bool(cond)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
return ok
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
return ok
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
def forward_diff(f, x, h=1e-6):
return (f(x + h) - f(x)) / h
def backward_diff(f, x, h=1e-6):
return (f(x) - f(x - h)) / h
def grad_check(f, x, analytic_grad, h=1e-6):
x = np.asarray(x, dtype=float)
analytic_grad = np.asarray(analytic_grad, dtype=float)
numeric_grad = np.zeros_like(x, dtype=float)
for idx in np.ndindex(x.shape):
x_plus = x.copy(); x_minus = x.copy()
x_plus[idx] += h; x_minus[idx] -= h
numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
return la.norm(analytic_grad - numeric_grad) / denom
def check(name, got, expected, tol=1e-8):
return check_close(name, got, expected, tol=tol)
print("Chapter helper setup complete.")
1. Intuition — The Derivative as Instantaneous Rate of Change
The derivative measures how fast a function changes at a point. We visualize the secant-to-tangent limit and explore what the derivative means geometrically.
Code cell 5
# === 1.1 Secant Lines Converging to Tangent ===
def f(x):
return x**2
def f_prime(x):
return 2*x
a = 1.0 # Point of tangency
h_values = [1.0, 0.5, 0.2, 0.05]
x = np.linspace(-0.5, 2.5, 300)
if HAS_MPL:
colors = plt.cm.plasma(np.linspace(0.2, 0.9, len(h_values)))
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax = axes[0]
ax.plot(x, f(x), 'k-', lw=2, label='$f(x) = x^2$')
# Secant lines
for i, h in enumerate(h_values):
slope = (f(a+h) - f(a)) / h
x_line = np.linspace(a-0.3, a+h+0.3, 50)
y_line = f(a) + slope*(x_line - a)
ax.plot(x_line, y_line, '--', color=colors[i], lw=1.5,
label=f'h={h:.2f}, slope={slope:.2f}')
# Tangent
x_tang = np.linspace(a-0.8, a+0.8, 50)
ax.plot(x_tang, f(a) + f_prime(a)*(x_tang - a), 'r-', lw=2.5,
label=f"Tangent: slope={f_prime(a):.1f}")
ax.plot(a, f(a), 'ro', ms=8)
ax.set_xlim(-0.2, 2.2); ax.set_ylim(-0.2, 4.5)
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Secant Lines → Tangent Line')
ax.legend(fontsize=9)
# Right panel: slope vs h
ax2 = axes[1]
h_fine = np.logspace(-4, 0, 200)
slopes = [(f(a+h) - f(a)) / h for h in h_fine]
ax2.semilogx(h_fine, slopes, 'b-', lw=2, label='Secant slope $(f(1+h)-f(1))/h$')
ax2.axhline(f_prime(a), color='r', ls='--', lw=2, label=f"True $f'(1)$ = {f_prime(a)}")
ax2.set_xlabel('Step size h'); ax2.set_ylabel('Slope')
ax2.set_title('Convergence of Secant to Tangent')
ax2.legend()
plt.tight_layout()
plt.savefig('/tmp/secant_tangent.png', dpi=100, bbox_inches='tight')
plt.show()
print('Figure saved.')
# Verify derivative
h = 1e-7
numerical = (f(1+h) - f(1)) / h
analytic = f_prime(1)
print(f'Numerical f\'(1): {numerical:.8f}')
print(f'Analytic f\'(1): {analytic:.8f}')
print(f'PASS - error < 1e-6: {abs(numerical - analytic) < 1e-6}')
2. Formal Definition — Derivative as a Limit
The derivative is defined as a limit of difference quotients: $f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$. We compute this limit explicitly for several functions and verify the results numerically.
Code cell 7
# === 2.1 Derivative from Definition — Worked Examples ===
# Example 1: f(x) = x^2, f'(x) = 2x
def limit_x2(x, h):
return ((x+h)**2 - x**2) / h
# Example 2: f(x) = sqrt(x), f'(x) = 1/(2*sqrt(x))
def limit_sqrt(x, h):
return (np.sqrt(x+h) - np.sqrt(x)) / h
# Example 3: f(x) = e^x, f'(x) = e^x
def limit_exp(x, h):
return (np.exp(x+h) - np.exp(x)) / h
x0 = 2.0
h_vals = [0.1, 0.01, 0.001, 1e-6]
print('=== Convergence of Difference Quotients ===')
print(f'\nf(x) = x^2 at x={x0} (true f\'={2*x0})')
for h in h_vals:
print(f' h={h:.0e}: {limit_x2(x0, h):.8f}')
print(f'\nf(x) = sqrt(x) at x={x0} (true f\'={1/(2*np.sqrt(x0)):.8f})')
for h in h_vals:
print(f' h={h:.0e}: {limit_sqrt(x0, h):.8f}')
print(f'\nf(x) = e^x at x={x0} (true f\'={np.exp(x0):.8f})')
for h in h_vals:
print(f' h={h:.0e}: {limit_exp(x0, h):.8f}')
Code cell 8
# === 2.2 Differentiable vs Continuous — ReLU and |x| ===
x = np.linspace(-2, 2, 400)
relu = np.maximum(0, x)
abs_x = np.abs(x)
# Difference quotients at x=0
h_vals = [0.5, 0.1, 0.01, 0.001]
print('=== Left and Right Derivatives at x=0 ===')
print('\nReLU:')
for h in h_vals:
left = (np.maximum(0, -h) - 0) / (-h)
right = (np.maximum(0, h) - 0) / h
print(f' h={h:.3f}: left={left:.3f}, right={right:.3f}')
print(' → right limit = 1, left limit = 0 → NOT differentiable (subgradient ∈ [0,1])')
print('\n|x|:')
for h in h_vals:
left = (abs(-h) - 0) / (-h)
right = (abs(h) - 0) / h
print(f' h={h:.3f}: left={left:.3f}, right={right:.3f}')
print(' → right limit = 1, left limit = -1 → NOT differentiable (corner)')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, func, name in [(axes[0], relu, 'ReLU(x)'), (axes[1], abs_x, '|x|')]:
ax.plot(x, func, 'b-', lw=2, label=name)
ax.axvline(0, color='r', ls='--', alpha=0.5, label='x=0 (non-diff.)')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title(f'{name} — continuous but not differentiable at 0')
ax.legend()
plt.tight_layout(); plt.show()
3. Basic Differentiation Rules
The power, product, quotient, and sum rules let us differentiate any algebraic expression without computing limits. We verify each rule numerically.
Code cell 10
# === 3.1-3.5 Differentiation Rules — Numerical Verification ===
def centered_diff(f, x, h=1e-5):
return (f(x+h) - f(x-h)) / (2*h)
def check(name, analytic_fn, numerical_fn, x):
a = analytic_fn(x)
n = numerical_fn(x)
ok = abs(a - n) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} {name}: analytic={a:.8f}, numerical={n:.8f}")
x = 1.5
print('=== Derivative Rules — Verification at x=1.5 ===')
# Power rule: (x^5)' = 5x^4
check('Power rule x^5', lambda x: 5*x**4, lambda x: centered_diff(lambda t: t**5, x), x)
# Product rule: (x^2 * sin(x))' = 2x*sin(x) + x^2*cos(x)
check('Product rule x^2*sin(x)',
lambda x: 2*x*np.sin(x) + x**2*np.cos(x),
lambda x: centered_diff(lambda t: t**2*np.sin(t), x), x)
# Quotient rule: (x^2/(x+1))' = (2x(x+1) - x^2)/(x+1)^2 = (x^2+2x)/(x+1)^2
check('Quotient rule x^2/(x+1)',
lambda x: (x**2 + 2*x) / (x+1)**2,
lambda x: centered_diff(lambda t: t**2/(t+1), x), x)
# Chain rule: sin(x^3)' = cos(x^3)*3x^2
check('Chain rule sin(x^3)',
lambda x: np.cos(x**3)*3*x**2,
lambda x: centered_diff(lambda t: np.sin(t**3), x), x)
# Logarithm: (ln(x))' = 1/x
check('Logarithm (ln x)',
lambda x: 1/x,
lambda x: centered_diff(np.log, x), x)
# Composite: e^(-x^2/2) - Gaussian kernel
check('Gaussian e^(-x^2/2)',
lambda x: -x*np.exp(-x**2/2),
lambda x: centered_diff(lambda t: np.exp(-t**2/2), x), x)
4. The Chain Rule
The chain rule is the mathematical foundation of backpropagation. Every node in a computation graph applies this rule to propagate gradients.
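Stated precisely, and in the layered form that backpropagation uses (the scalar demo below walks through exactly this factorization):

$$
(f \circ g)'(x) = f'(g(x))\, g'(x),
\qquad
\frac{\partial L}{\partial w} = \frac{\partial L}{\partial a} \cdot \frac{\partial a}{\partial z} \cdot \frac{\partial z}{\partial w}.
$$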
Code cell 12
# === 4.1-4.3 Chain Rule — Composition Analysis ===
def centered_diff(f, x, h=1e-6):
return (f(x+h) - f(x-h)) / (2*h)
x_vals = np.linspace(-2, 2, 200)
x = 1.0
# Example 1: y = e^(-x^2/2) — Gaussian
f1 = lambda t: np.exp(-t**2/2)
f1_prime_analytic = lambda t: -t * np.exp(-t**2/2)
err1 = abs(centered_diff(f1, x) - f1_prime_analytic(x))
print(f'Gaussian derivative error: {err1:.2e}')
# Example 2: y = sin(ln(x)) for x>0
f2 = lambda t: np.sin(np.log(t))
f2_prime_analytic = lambda t: np.cos(np.log(t)) / t
err2 = abs(centered_diff(f2, 2.0) - f2_prime_analytic(2.0))
print(f'sin(ln(x)) derivative error: {err2:.2e}')
# Example 3: y = (1+x^2)^10
f3 = lambda t: (1+t**2)**10
f3_prime_analytic = lambda t: 20*t*(1+t**2)**9
err3 = abs(centered_diff(f3, x) - f3_prime_analytic(x))
print(f'(1+x^2)^10 derivative error: {err3:.2e}')
print('\nAll chain rule verifications PASS' if max(err1,err2,err3) < 1e-5 else 'CHECK FAILED')
# Scalar backprop demo
print('\n=== Scalar Backpropagation Demo ===')
print('Network: L = (sigma(w*x+b) - y)^2 / 2')
w, b, x_in, y = 2.0, -1.0, 1.5, 0.8
# Forward pass
z = w*x_in + b
sigma_z = 1 / (1 + np.exp(-z))
L = 0.5 * (sigma_z - y)**2
# Backward pass
dL_da = sigma_z - y
da_dz = sigma_z * (1 - sigma_z)
dL_dz = dL_da * da_dz
dL_dw = dL_dz * x_in
dL_db = dL_dz * 1
# Numerical check
num_dw = (0.5*(1/(1+np.exp(-((w+1e-5)*x_in+b)))-y)**2 - 0.5*(1/(1+np.exp(-((w-1e-5)*x_in+b)))-y)**2) / 2e-5
print(f'dL/dw — analytic: {dL_dw:.8f}, numerical: {num_dw:.8f}')
print(f'PASS: {abs(dL_dw - num_dw) < 1e-5}')
5. Implicit Differentiation and Linear Approximation
When $y$ is defined implicitly by $F(x, y) = 0$, differentiate both sides with respect to $x$, treating $y$ as a function of $x$; this gives $dy/dx = -F_x / F_y$ wherever $F_y \neq 0$. Linear approximation, $f(x) \approx f(a) + f'(a)(x - a)$, is the derivative's most direct application.
Code cell 14
# === 5.1-5.3 Implicit Differentiation and Linear Approximation ===
import numpy as np
# --- Implicit: circle x^2 + y^2 = r^2 ---
# dy/dx = -x/y
r = 3.0
theta = np.linspace(0, 2*np.pi, 400)
x_circ = r * np.cos(theta)
y_circ = r * np.sin(theta)
# At point (x0, y0) on upper semicircle
x0 = 2.0
y0 = np.sqrt(r**2 - x0**2)
slope_implicit = -x0 / y0
# Verify numerically: y = sqrt(r^2 - x^2), dy/dx = -x/sqrt(r^2-x^2)
slope_numerical = (-x0 / np.sqrt(r**2 - x0**2))
print(f'Circle implicit slope at ({x0:.1f}, {y0:.3f}): {slope_implicit:.6f}')
print(f'Numerical check: {slope_numerical:.6f}')
print(f'PASS: {abs(slope_implicit - slope_numerical) < 1e-10}')
# --- Linear approximation demo ---
print('\n=== Linear Approximation: sin(x) near x=pi/4 ===')
a = np.pi / 4
f_a = np.sin(a)
f_prime_a = np.cos(a)
x_test_vals = [a + dx for dx in [-0.3, -0.1, 0.1, 0.3]]
print(f'Linearization: f(x) ≈ {f_a:.4f} + {f_prime_a:.4f}*(x - {a:.4f})')
print(f'{"x":>8} {"True f(x)":>12} {"Linear approx":>14} {"Error":>10}')
for x in x_test_vals:
true_val = np.sin(x)
linear_val = f_a + f_prime_a*(x - a)
error = abs(true_val - linear_val)
print(f'{x:8.4f} {true_val:12.6f} {linear_val:14.6f} {error:10.2e}')
# Cross-entropy gradient via related rates
print('\n=== Cross-Entropy Gradient via Chain Rule ===')
z = 1.5
sigma_z = 1/(1+np.exp(-z))
# L = -log(sigma(z)), dL/dz = sigma(z) - 1
dL_dz_analytic = sigma_z - 1
h = 1e-6
L_plus = -np.log(1/(1+np.exp(-(z+h))))
L_minus = -np.log(1/(1+np.exp(-(z-h))))
dL_dz_numerical = (L_plus - L_minus) / (2*h)
print(f'dL/dz = sigma(z)-1: analytic={dL_dz_analytic:.8f}, numerical={dL_dz_numerical:.8f}')
print(f'PASS: {abs(dL_dz_analytic - dL_dz_numerical) < 1e-7}')
6. Higher-Order Derivatives and Concavity
$f''(x) > 0$ means concave up; $f''(x) < 0$ means concave down. The second derivative determines local curvature of the loss surface.
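The second-derivative test applied in the cell below: at a critical point $c$ with $f'(c) = 0$,

$$
f''(c) > 0 \Rightarrow \text{local minimum}, \qquad
f''(c) < 0 \Rightarrow \text{local maximum}, \qquad
f''(c) = 0 \Rightarrow \text{inconclusive}.
$$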
Code cell 16
# === 6.1-6.4 Higher-Order Derivatives ===
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
x = np.linspace(-3, 3, 500)
# f = x^4 - 4x^2 (W-shape — two minima)
f_vals = x**4 - 4*x**2
fp_vals = 4*x**3 - 8*x
fpp_vals = 12*x**2 - 8
# Critical points: f' = 0 => 4x^3 - 8x = 4x(x^2-2) = 0 => x in {0, ±sqrt(2)}
crit = [0, np.sqrt(2), -np.sqrt(2)]
print('=== Critical Points of f(x) = x^4 - 4x^2 ===')
for c in crit:
fpc = 4*c**3 - 8*c
fppc = 12*c**2 - 8
kind = 'local min' if fppc > 0 else ('local max' if fppc < 0 else 'inconclusive')
print(f" x={c:+.4f}: f={c**4-4*c**2:.4f}, f_prime={fpc:.4f}, f_double_prime={fppc:.4f} -> {kind}")
# Inflection points: f'' = 0 => 12x^2 = 8 => x = ±sqrt(2/3)
infl = np.sqrt(2/3)
print(f'\nInflection points at x = ±{infl:.4f}')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, vals, name, color in zip(axes,
[f_vals, fp_vals, fpp_vals],
['f(x) = x⁴ - 4x²', "f'(x)", "f''(x)"],
['blue', 'green', 'red']):
ax.plot(x, vals, color=color, lw=2, label=name)
ax.axhline(0, color='k', lw=0.8)
ax.set_xlim(-3, 3); ax.set_ylim(-5, 10)
ax.set_xlabel('x'); ax.set_title(name)
ax.legend()
plt.suptitle('Function, First, and Second Derivative', fontsize=13)
plt.tight_layout(); plt.show()
7. Extrema, Critical Points, and the Mean Value Theorem
The MVT guarantees a point where the instantaneous rate equals the average rate. This underlies Lipschitz bounds and gradient descent convergence theory.
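Precisely: if $f$ is continuous on $[a, b]$ and differentiable on $(a, b)$, then

$$
\exists\, c \in (a, b): \quad f'(c) = \frac{f(b) - f(a)}{b - a}.
$$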
Code cell 18
# === 7.1-7.4 Extrema and MVT ===
# --- Global extrema on closed interval ---
def f(x):
return x**3 - 3*x
def fp(x):
return 3*x**2 - 3
a, b = -2.0, 2.5
# Critical points in (a,b): f'(x) = 0 => x = ±1
crit = [-1.0, 1.0]
candidates = [a] + [c for c in crit if a < c < b] + [b]
print('=== Global Extrema of f(x) = x^3 - 3x on [-2, 2.5] ===')
print(f'{"Point":>8} {"f(x)":>10} {"Type":>15}')
f_vals = [(x, f(x)) for x in candidates]
min_x = min(f_vals, key=lambda p: p[1])
max_x = max(f_vals, key=lambda p: p[1])
for pt, val in f_vals:
kind = 'Global MIN' if pt == min_x[0] else ('Global MAX' if pt == max_x[0] else 'Candidate')
print(f'{pt:8.3f} {val:10.4f} {kind:>15}')
# --- MVT illustration ---
print('\n=== Mean Value Theorem ===')
x_mvt = np.linspace(a, b, 1000)
avg_slope = (f(b) - f(a)) / (b - a)
print(f'Average slope [f({b})-f({a})] / ({b}-{a}) = {avg_slope:.4f}')
# Find c where f'(c) = avg_slope: 3c^2 - 3 = avg_slope => c = sqrt((avg_slope+3)/3)
c_mvt = np.sqrt((avg_slope + 3) / 3)
print(f'MVT guarantees c in ({a},{b}) where f\'(c) = {avg_slope:.4f}')
print(f'Found c = {c_mvt:.4f}, f\'({c_mvt:.4f}) = {fp(c_mvt):.4f}')
print(f'PASS — MVT c in interval: {a < c_mvt < b}')
8. Activation Function Derivatives
Activation functions and their derivatives control gradient flow in neural networks. The vanishing gradient problem arises from small derivative values compounding multiplicatively through layers.
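The identities verified numerically below:

$$
\sigma'(x) = \sigma(x)\bigl(1 - \sigma(x)\bigr) \le \tfrac{1}{4},
\qquad
\tanh'(x) = 1 - \tanh^2(x),
\qquad
\mathrm{ReLU}'(x) = \mathbf{1}[x > 0].
$$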
Code cell 20
# === 8.1-8.5 Activation Derivatives ===
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
x = np.linspace(-5, 5, 500)
# Sigmoid
def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_prime(x): s = sigmoid(x); return s * (1 - s)
# Tanh
def tanh_prime(x): return 1 - np.tanh(x)**2
# ReLU
def relu(x): return np.maximum(0, x)
def relu_prime(x): return np.where(np.asarray(x) > 0, 1.0, 0.0)
# GELU
from scipy.special import erf
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def gelu_prime(x):
Phi = 0.5 * (1 + erf(x / np.sqrt(2)))
phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
return Phi + x * phi
# Numerical verification of derivatives at x=1.5
x0 = 1.5
h = 1e-6
def num_deriv(f, x): return (f(x+h) - f(x-h)) / (2*h)
print('=== Activation Derivative Verification at x=1.5 ===')
tests = [
('Sigmoid', sigmoid, sigmoid_prime),
('Tanh', np.tanh, tanh_prime),
('ReLU', relu, relu_prime),
('GELU', gelu, gelu_prime),
]
for name, fn, fn_prime in tests:
a = fn_prime(x0)
n = num_deriv(fn, x0)
ok = abs(a - n) < 1e-5
print(f" {'PASS' if ok else 'FAIL'} {name}: analytic={a:.6f}, numerical={n:.6f}")
# Vanishing gradient analysis
print('\n=== Vanishing Gradient — Sigmoid Product Through L Layers ===')
z_typical = 2.0 # typical saturated input
sig_prime_val = sigmoid_prime(z_typical)
print(f'sigma\'({z_typical}) = {sig_prime_val:.6f}')
for L in [5, 10, 20, 50]:
product = sig_prime_val**L
print(f' {L} layers: (sigma\')^{L} = {product:.2e}')
Code cell 21
# === 8.5 Activation Derivatives — Visualization ===
try:
import matplotlib.pyplot as plt
from scipy.special import erf
HAS_MPL = True
except ImportError:
HAS_MPL = False
import numpy as np
def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_prime(x): s = sigmoid(x); return s * (1 - s)
def relu_prime(x): return np.where(np.asarray(x) > 0, 1.0, 0.0)
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def gelu_prime(x):
Phi = 0.5 * (1 + erf(x / np.sqrt(2)))
phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
return Phi + x * phi
x = np.linspace(-4, 4, 400)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: activation functions
ax = axes[0]
ax.plot(x, sigmoid(x), label='Sigmoid', lw=2)
ax.plot(x, np.tanh(x), label='Tanh', lw=2)
ax.plot(x, np.maximum(0, x), label='ReLU', lw=2)
ax.plot(x, gelu(x), label='GELU', lw=2, ls='--')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Activation Functions')
ax.legend(); ax.set_ylim(-1.5, 4)
# Right: derivatives
ax2 = axes[1]
ax2.plot(x, sigmoid_prime(x), label="Sigmoid'", lw=2)
ax2.plot(x, 1 - np.tanh(x)**2, label="Tanh'", lw=2)
ax2.plot(x, relu_prime(x), label="ReLU'", lw=2)
ax2.plot(x, gelu_prime(x), label="GELU'", lw=2, ls='--')
ax2.axhline(0.25, color='gray', ls=':', lw=1, label='1/4 (sigmoid max)')
ax2.set_xlabel('x'); ax2.set_ylabel("f'(x)")
ax2.set_title('Activation Derivatives')
ax2.legend(); ax2.set_ylim(-0.2, 1.2)
plt.tight_layout(); plt.show()
print('Activation derivatives plotted.')
print('Peak gradient — Sigmoid:', f"{max(sigmoid_prime(x)):.4f} at x=0")
print('Peak gradient — Tanh: ', f"{max(1-np.tanh(x)**2):.4f} at x=0")
print('Constant grad — ReLU: 1.0 for x>0 (no saturation)')
9. Numerical Differentiation
Finite differences approximate derivatives numerically. Two errors compete: truncation error (decreases with $h$) and floating-point round-off (increases as $h \to 0$). The centered difference $\frac{f(x+h) - f(x-h)}{2h}$ is strongly preferred over the forward difference $\frac{f(x+h) - f(x)}{h}$.
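Taylor expansion makes the truncation orders explicit (these are the slopes visible in the log-log error plot below):

$$
\frac{f(x+h) - f(x)}{h} = f'(x) + \frac{h}{2} f''(x) + O(h^2),
\qquad
\frac{f(x+h) - f(x-h)}{2h} = f'(x) + \frac{h^2}{6} f'''(x) + O(h^4).
$$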
Code cell 23
# === 9.1-9.2 Finite Differences — Error Analysis ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def f(x): return np.sin(x)
def f_prime(x): return np.cos(x)
x0 = 1.0
true_deriv = f_prime(x0)
h_vals = np.logspace(-16, 0, 160)
forward_errors = []
centered_errors = []
for h in h_vals:
fwd = (f(x0+h) - f(x0)) / h
ctr = (f(x0+h) - f(x0-h)) / (2*h)
forward_errors.append(abs(fwd - true_deriv))
centered_errors.append(abs(ctr - true_deriv))
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 6))
ax.loglog(h_vals, forward_errors, 'b-', lw=2, label='Forward diff O(h)')
ax.loglog(h_vals, centered_errors, 'r-', lw=2, label='Centered diff O(h²)')
# Reference slopes
h_ref = np.array([1e-8, 1e-4])
ax.loglog(h_ref, h_ref, 'b--', alpha=0.5, label='O(h) slope')
ax.loglog(h_ref, h_ref**2 * 1e4, 'r--', alpha=0.5, label='O(h²) slope')
ax.set_xlabel('Step size h')
ax.set_ylabel('|Error|')
ax.set_title('Finite Difference Error vs Step Size (sin(x) at x=1)')
ax.legend()
plt.tight_layout(); plt.show()
# Find optimal h
opt_fwd_idx = np.argmin(forward_errors)
opt_ctr_idx = np.argmin(centered_errors)
print(f'Optimal h for forward diff: {h_vals[opt_fwd_idx]:.2e} (error: {forward_errors[opt_fwd_idx]:.2e})')
print(f'Optimal h for centered diff: {h_vals[opt_ctr_idx]:.2e} (error: {centered_errors[opt_ctr_idx]:.2e})')
print(f'Theoretical optimal centered: ~6e-6 (eps_mach^(1/3))')
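The printed heuristic comes from balancing the $O(h^2)$ truncation error against the $O(\varepsilon/h)$ round-off error; a one-liner confirms the machine-epsilon estimate (a minimal check, not part of the original cell):

```python
import numpy as np

eps = np.finfo(float).eps                    # machine epsilon, ~2.22e-16 for float64
print(f"eps_mach^(1/3) = {eps**(1/3):.2e}")  # ~6.06e-06, matching the empirical optimum
```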
Code cell 24
# === 9.3 Gradient Checking — Verifying Analytical Gradients ===
import numpy as np
def grad_check_scalar(f, x, analytic_grad, h=1e-5):
"""Check gradient of scalar-to-scalar function f."""
numerical = (f(x+h) - f(x-h)) / (2*h)
rel_error = abs(analytic_grad - numerical) / (abs(analytic_grad) + abs(numerical) + 1e-10)
return rel_error, numerical
def grad_check_vector(f, x, analytic_grad, h=1e-5):
"""Check gradient of vector-to-scalar function f."""
n = len(x)
numerical_grad = np.zeros(n)
for i in range(n):
x_plus = x.copy(); x_plus[i] += h
x_minus = x.copy(); x_minus[i] -= h
numerical_grad[i] = (f(x_plus) - f(x_minus)) / (2*h)
rel_error = np.linalg.norm(analytic_grad - numerical_grad) / \
(np.linalg.norm(analytic_grad) + np.linalg.norm(numerical_grad) + 1e-10)
return rel_error, numerical_grad
print('=== Gradient Checking Examples ===')
# Example 1: f(x) = x*ln(x+e^(-x))
def f1(x): return x * np.log(x + np.exp(-x))
def f1_prime(x): return np.log(x + np.exp(-x)) + x * (1 - np.exp(-x)) / (x + np.exp(-x))
x_test = 1.5
err1, num1 = grad_check_scalar(f1, x_test, f1_prime(x_test))
print(f'f(x) = x*ln(x+e^-x): analytic={f1_prime(x_test):.6f}, numerical={num1:.6f}')
print(f' Relative error: {err1:.2e} → {"PASS" if err1 < 1e-5 else "FAIL"}')
# Example 2: L2 loss gradient
def loss(w):
x = np.array([1.0, 2.0, -1.0])
y = 1.5
return 0.5 * (np.dot(w, x) - y)**2
def loss_grad(w):
x = np.array([1.0, 2.0, -1.0])
y = 1.5
return (np.dot(w, x) - y) * x
w = np.array([0.5, -0.3, 1.2])
err2, num_grad = grad_check_vector(loss, w, loss_grad(w))
print(f'\nL2 loss gradient:')
print(f' Analytic: {loss_grad(w)}')
print(f' Numerical: {num_grad}')
print(f' Relative error: {err2:.2e} → {"PASS" if err2 < 1e-5 else "FAIL"}')
10. Key Proofs — Interactive Verification
We verify the main derivative proofs computationally, checking that the theoretical formulas match numerical estimates to machine precision.
Code cell 26
# === 10. Proof Verification ===
import numpy as np
def centered(f, x, h=1e-7):
return (f(x+h) - f(x-h)) / (2*h)
print('=== Verifying All Main Derivative Rules ===')
tests = [
# (name, analytic_fn, numerical_fn, x)
('Power x^7', lambda x: 7*x**6, lambda x: centered(lambda t: t**7, x), 1.5),
('sin(x)', np.cos, lambda x: centered(np.sin, x), 1.2),
('cos(x)', lambda x: -np.sin(x), lambda x: centered(np.cos, x), 0.7),
('e^x', np.exp, lambda x: centered(np.exp, x), 2.0),
('ln(x)', lambda x: 1/x, lambda x: centered(np.log, x), 3.0),
('arctan(x)', lambda x: 1/(1+x**2), lambda x: centered(np.arctan, x), 1.0),
('arcsin(x)', lambda x: 1/np.sqrt(1-x**2), lambda x: centered(np.arcsin, x), 0.5),
('x^x (log diff)', lambda x: x**x*(np.log(x)+1), lambda x: centered(lambda t: t**t, x), 2.0),
('x^2*e^x (product)', lambda x: x**2*np.exp(x) + 2*x*np.exp(x),
lambda x: centered(lambda t: t**2*np.exp(t), x), 1.5),
('sin(e^x) (chain)', lambda x: np.cos(np.exp(x))*np.exp(x),
lambda x: centered(lambda t: np.sin(np.exp(t)), x), 0.5),
('Softplus derivative=sigma', lambda x: 1/(1+np.exp(-x)),
lambda x: centered(lambda t: np.log(1+np.exp(t)), x), 1.0),
]
all_pass = True
for name, analytic, numerical, x in tests:
a = analytic(x); n = numerical(x)
err = abs(a - n) / (abs(a) + 1e-10)
ok = err < 1e-5
all_pass = all_pass and ok
print(f" {'PASS' if ok else 'FAIL'} {name}: err={err:.2e}")
print(f'\nAll tests: {"PASS" if all_pass else "SOME FAILED"}')
11. Application — Adam Optimizer Analysis
The Adam optimizer maintains exponential moving averages of the gradient (first moment) and the squared gradient (second moment) to adapt per-parameter learning rates. We simulate Adam on a perturbed quadratic and compare its convergence against vanilla SGD.
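The update implemented in `run_adam` below, with bias-corrected moment estimates:

$$
m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t, \quad
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2, \quad
\hat m_t = \frac{m_t}{1 - \beta_1^t}, \quad
\hat v_t = \frac{v_t}{1 - \beta_2^t}, \quad
\theta_{t+1} = \theta_t - \eta\, \frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}.
$$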
Code cell 28
# === 11. Adam Optimizer — Single Variable Simulation ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
# Loss: f(theta) = (theta - 3)^2 + 0.1*sin(5*theta)
def loss(theta): return (theta - 3)**2 + 0.1*np.sin(5*theta)
def grad(theta): return 2*(theta - 3) + 0.5*np.cos(5*theta)
def run_sgd(theta0, lr=0.1, steps=200):
theta = theta0; trajectory = [theta0]
for _ in range(steps):
theta -= lr * grad(theta)
trajectory.append(theta)
return np.array(trajectory)
def run_adam(theta0, lr=0.1, b1=0.9, b2=0.999, eps=1e-8, steps=200):
theta = theta0; m = 0.0; v = 0.0
trajectory = [theta0]
for t in range(1, steps+1):
g = grad(theta)
m = b1*m + (1-b1)*g
v = b2*v + (1-b2)*g**2
m_hat = m / (1 - b1**t)
v_hat = v / (1 - b2**t)
theta -= lr * m_hat / (np.sqrt(v_hat) + eps)
trajectory.append(theta)
return np.array(trajectory)
theta0 = -2.0
sgd_traj = run_sgd(theta0, lr=0.1)
adam_traj = run_adam(theta0, lr=0.5)
print('=== Optimizer Comparison ===')
print(f'True minimum near theta=3 (ignoring sin perturbation)')
print(f'SGD final theta: {sgd_traj[-1]:.6f}, loss: {loss(sgd_traj[-1]):.6f}')
print(f'Adam final theta: {adam_traj[-1]:.6f}, loss: {loss(adam_traj[-1]):.6f}')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
steps = np.arange(len(sgd_traj))
axes[0].plot(steps, loss(sgd_traj), 'b-', lw=2, label='SGD')
axes[0].plot(steps, loss(adam_traj), 'r-', lw=2, label='Adam')
axes[0].set_xlabel('Step'); axes[0].set_ylabel('Loss')
axes[0].set_title('Loss vs Training Step')
axes[0].legend(); axes[0].set_yscale('log')
theta_plot = np.linspace(-3, 5, 300)
axes[1].plot(theta_plot, loss(theta_plot), 'k-', lw=2, label='Loss surface')
axes[1].plot(sgd_traj[:50], loss(sgd_traj[:50]), 'b.-', ms=4, alpha=0.7, label='SGD path')
axes[1].plot(adam_traj[:50], loss(adam_traj[:50]), 'r.-', ms=4, alpha=0.7, label='Adam path')
axes[1].set_xlabel('theta'); axes[1].set_ylabel('Loss')
axes[1].set_title('Optimizer Trajectories on Loss Surface (first 50 steps)')
axes[1].legend()
plt.tight_layout(); plt.show()
12. Backpropagation — Chain Rule on Computation Graphs
We implement a complete 2-layer network forward/backward pass, verify gradients against numerical estimates, and visualize gradient flow through layers.
Code cell 30
# === 12. Full 2-Layer Network Backprop ===
import numpy as np
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)
class TwoLayerNet:
    def __init__(self, w1, b1, w2, b2):
        self.w1 = w1; self.b1 = b1
        self.w2 = w2; self.b2 = b2
    def forward(self, x, y):
        self.x = x; self.y = y  # cache inputs for the backward pass
        self.z1 = self.w1*x + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = self.w2*self.a1 + self.b2
        self.a2 = sigmoid(self.z2)
        self.loss = 0.5*(self.a2 - y)**2
        return self.loss
    def backward(self):
        # Layer 2: dL/dz2 = (a2 - y) * sigma'(z2)
        dL_dz2 = (self.a2 - self.y) * sigmoid_prime(self.z2)
        dL_dw2 = dL_dz2 * self.a1
        dL_db2 = dL_dz2
        # Layer 1: chain through w2, then sigma'(z1)
        dL_dz1 = dL_dz2 * self.w2 * sigmoid_prime(self.z1)
        dL_dw1 = dL_dz1 * self.x
        dL_db1 = dL_dz1
        return dict(w1=dL_dw1, b1=dL_db1, w2=dL_dw2, b2=dL_db2)
# Manual backprop
def backprop(x, y, w1, b1, w2, b2):
# Forward
z1 = w1*x + b1
a1 = sigmoid(z1)
z2 = w2*a1 + b2
a2 = sigmoid(z2)
L = 0.5*(a2 - y)**2
# Backward
dL_da2 = a2 - y
da2_dz2 = sigmoid_prime(z2)
dL_dz2 = dL_da2 * da2_dz2
dL_dw2 = dL_dz2 * a1
dL_db2 = dL_dz2
dL_da1 = dL_dz2 * w2
da1_dz1 = sigmoid_prime(z1)
dL_dz1 = dL_da1 * da1_dz1
dL_dw1 = dL_dz1 * x
dL_db1 = dL_dz1
return L, dict(w1=dL_dw1, b1=dL_db1, w2=dL_dw2, b2=dL_db2)
# Test values
x, y, w1, b1, w2, b2 = 1.5, 0.7, 2.0, -0.5, 1.2, 0.3
L, grads = backprop(x, y, w1, b1, w2, b2)
print('=== 2-Layer Network Gradient Check ===')
print(f'Loss: {L:.6f}')
# Numerical check via parameter perturbation
h = 1e-5
for param_name, param_val, idx in [('w1',w1,0),('b1',b1,1),('w2',w2,2),('b2',b2,3)]:
params = [w1, b1, w2, b2]
p_plus = params[:]; p_plus[idx] += h
p_minus = params[:]; p_minus[idx] -= h
L_p = backprop(x, y, *p_plus)[0]
L_m = backprop(x, y, *p_minus)[0]
numerical = (L_p - L_m) / (2*h)
analytic = grads[param_name]
ok = abs(analytic - numerical) < 1e-6
print(f" {'PASS' if ok else 'FAIL'} d{param_name}: analytic={analytic:.8f}, "
f"numerical={numerical:.8f}")
13. Softmax and Cross-Entropy Gradient
The gradient of cross-entropy loss with respect to softmax logits has a beautifully simple form: $\partial L / \partial z_i = p_i - y_i$.
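A one-line derivation for one-hot $y$ with true class $c$:

$$
L = -\log p_c = -z_c + \log \sum_j e^{z_j}
\;\Longrightarrow\;
\frac{\partial L}{\partial z_i} = \frac{e^{z_i}}{\sum_j e^{z_j}} - \mathbf{1}[i = c] = p_i - y_i.
$$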
Code cell 32
# === 13. Softmax + Cross-Entropy Gradient ===
import numpy as np
def softmax(z):
z_stable = z - np.max(z) # numerical stability
e = np.exp(z_stable)
return e / e.sum()
def cross_entropy(p, y):
return -np.sum(y * np.log(p + 1e-12))
def softmax_ce_grad(z, y):
"""Gradient of CE loss w.r.t. logits z — should equal p - y."""
p = softmax(z)
return p - y
# Test
K = 5
np.random.seed(42)
z = np.random.randn(K)
y = np.zeros(K); y[2] = 1.0 # true class = 2
p = softmax(z)
L = cross_entropy(p, y)
analytic_grad = softmax_ce_grad(z, y)
# Numerical gradient
h = 1e-5
numerical_grad = np.zeros(K)
for i in range(K):
z_p = z.copy(); z_p[i] += h
z_m = z.copy(); z_m[i] -= h
numerical_grad[i] = (cross_entropy(softmax(z_p), y) - cross_entropy(softmax(z_m), y)) / (2*h)
print('=== Softmax + Cross-Entropy Gradient ===')
print(f'Logits z: {z}')
print(f'Probs p: {p}')
print(f'Loss L: {L:.6f}')
print(f'\nGradient comparison (should be p - y):')
print(f' p - y: {p - y}')
print(f' Analytic: {analytic_grad}')
print(f' Numerical: {numerical_grad}')
err = np.linalg.norm(analytic_grad - numerical_grad)
print(f'\nPASS - gradient error: {err:.2e}' if err < 1e-5 else f'FAIL - error: {err:.2e}')
14. Vanishing Gradients — Quantitative Analysis
We simulate gradient propagation through sigmoid layers and observe exponential decay. This motivates ReLU and residual connections in deep networks.
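The quantity being simulated: ignoring weight factors (as the code does), the gradient reaching layer 1 contains a product of local sigmoid derivatives, so

$$
\left|\frac{\partial L}{\partial a_1}\right| \propto \prod_{\ell=1}^{L} \bigl|\sigma'(z_\ell)\bigr| \le \left(\tfrac{1}{4}\right)^{\!L},
$$

which decays exponentially in the depth $L$.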
Code cell 34
# === 14. Vanishing Gradient Simulation ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)
def relu_prime(x): return float(x > 0)
np.random.seed(42)
max_layers = 50
# Simulate gradient magnitude at layer 1 as function of depth L
sigmoid_grads = []
relu_grads = []
for L in range(1, max_layers+1):
# Random pre-activations (sigmoid tends to saturate)
z_sigmoid = np.random.randn(L) * 2 # some saturation
z_relu = np.abs(np.random.randn(L)) # mostly positive
grad_sig = np.prod([sigmoid_prime(z) for z in z_sigmoid])
grad_relu = np.prod([relu_prime(z) for z in z_relu])
sigmoid_grads.append(abs(grad_sig))
relu_grads.append(abs(grad_relu))
layers = np.arange(1, max_layers+1)
print('=== Gradient Magnitude at Layer 1 vs Network Depth ===')
for L in [1, 5, 10, 20, 50]:
print(f' L={L:2d}: sigmoid={sigmoid_grads[L-1]:.2e}, relu={relu_grads[L-1]:.2e}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 6))
ax.semilogy(layers, sigmoid_grads, 'b-', lw=2, label='Sigmoid activations')
ax.semilogy(layers, relu_grads, 'r-', lw=2, label='ReLU activations')
ax.axhline(1e-7, color='gray', ls=':', label='Float32 precision floor')
ax.set_xlabel('Number of layers L')
ax.set_ylabel('|Gradient at layer 1|')
ax.set_title('Vanishing Gradient: Sigmoid vs ReLU')
ax.legend()
plt.tight_layout(); plt.show()
print('\nConclusion: sigmoid gradients vanish exponentially;')
print('ReLU maintains O(1) gradient for active units.')
15. Numerically Stable Derivative Computations
Floating-point arithmetic introduces subtle errors. We compare naive vs. stable implementations for sigmoid, log-sigmoid, and softmax.
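The identity behind the stable log-sigmoid in the cell below:

$$
\log \sigma(x) = -\log\bigl(1 + e^{-x}\bigr) = -\operatorname{logaddexp}(0, -x),
$$

which `np.logaddexp` evaluates without overflow for any $x$.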
Code cell 36
# === 15. Numerical Stability ===
import numpy as np
# Naive sigmoid — overflows for large negative x
def sigmoid_naive(x): return 1 / (1 + np.exp(-x))
# Stable sigmoid
def sigmoid_stable(x):
return np.where(x >= 0,
1 / (1 + np.exp(-x)),
np.exp(x) / (1 + np.exp(x)))
# Naive log-sigmoid
def log_sigmoid_naive(x): return np.log(sigmoid_naive(x))
# Stable log-sigmoid
def log_sigmoid_stable(x): return -np.logaddexp(0, -x)
test_vals = [-1000, -100, -10, 0, 10, 100, 1000]
print('=== Sigmoid Stability ===')
print(f'{"x":>8} {"Naive":>15} {"Stable":>15} {"True":>15}')
for x in test_vals:
naive = sigmoid_naive(np.float64(x))
stable = sigmoid_stable(np.float64(x))
true_val = 1/(1+np.exp(-np.float64(x))) if abs(x) < 700 else (1.0 if x > 0 else 0.0)
print(f'{x:8.0f} {naive:15.8f} {stable:15.8f} {true_val:15.8f}')
print('\n=== Log-Sigmoid Stability ===')
print(f'{"x":>8} {"Naive":>15} {"Stable":>15}')
for x in [-100, -10, 0, 10, 100]:
with np.errstate(divide='ignore', invalid='ignore'):
naive = log_sigmoid_naive(np.float64(x))
stable = log_sigmoid_stable(np.float64(x))
print(f'{x:8.0f} {naive:15.8f} {stable:15.8f}')
print('\nNaive gives -inf or 0 for large |x|; stable is correct throughout.')
16. Logarithmic Differentiation
When the function is a product/quotient of many terms, taking $\ln$ first and differentiating implicitly, via $f' = f \cdot (\ln f)'$, is often much simpler.
Code cell 38
# === 16. Logarithmic Differentiation ===
import numpy as np
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)
# f(x) = x^x
# ln f = x ln x => f'/f = ln x + 1 => f' = x^x * (ln x + 1)
def f_xx(x): return x**x
def f_xx_prime(x): return x**x * (np.log(x) + 1)
x = 2.0
print(f'x^x derivative at x=2:')
print(f' Analytic: {f_xx_prime(x):.8f}')
print(f' Numerical: {centered(f_xx, x):.8f}')
print(f' PASS: {abs(f_xx_prime(x) - centered(f_xx, x)) < 1e-5}')
# f(x) = (sin x)^x for x > 0
# ln f = x ln(sin x) => f'/f = ln(sin x) + x*cos(x)/sin(x)
def f_sinx_x(x): return np.sin(x)**x
def f_sinx_x_prime(x):
return np.sin(x)**x * (np.log(np.sin(x)) + x*np.cos(x)/np.sin(x))
x = 1.5
print(f'\n(sin x)^x derivative at x=1.5:')
print(f' Analytic: {f_sinx_x_prime(x):.8f}')
print(f' Numerical: {centered(f_sinx_x, x):.8f}')
print(f' PASS: {abs(f_sinx_x_prime(x) - centered(f_sinx_x, x)) < 1e-5}')
# Demonstrate for multi-factor expression
# g(x) = (x^3 * sqrt(x+1)) / (x^2+2)^4
def g(x): return (x**3 * np.sqrt(x+1)) / (x**2+2)**4
def g_prime(x):
# ln g = 3 ln x + 0.5 ln(x+1) - 4 ln(x^2+2)
# g'/g = 3/x + 1/(2(x+1)) - 8x/(x^2+2)
return g(x) * (3/x + 1/(2*(x+1)) - 8*x/(x**2+2))
x = 1.5
print(f'\nMulti-factor g(x) at x=1.5:')
print(f' Analytic: {g_prime(x):.8f}')
print(f' Numerical: {centered(g, x):.8f}')
print(f' PASS: {abs(g_prime(x) - centered(g, x)) < 1e-5}')
17. Inverse Function Theorem
If $f$ is differentiable at $x$ with $f'(x) \neq 0$, then $f^{-1}$ is differentiable at $y = f(x)$ with $(f^{-1})'(y) = 1 / f'(x)$. We use this to derive all inverse trig derivatives.
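For example, applying the theorem to $y = \arcsin x$ (one of the checks below):

$$
\sin y = x \;\Longrightarrow\; (\arcsin)'(x) = \frac{1}{\cos y} = \frac{1}{\sqrt{1 - \sin^2 y}} = \frac{1}{\sqrt{1 - x^2}}.
$$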
Code cell 40
# === 17. Inverse Function Theorem ===
import numpy as np
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)
# arcsin: (arcsin x)' = 1/sqrt(1-x^2)
def arcsin_prime(x): return 1/np.sqrt(1-x**2)
# arctan: (arctan x)' = 1/(1+x^2)
def arctan_prime(x): return 1/(1+x**2)
test_pts = [0.0, 0.3, -0.5]
print('=== Inverse Trig Derivatives ===')
for x in test_pts:
a_asin = arcsin_prime(x); n_asin = centered(np.arcsin, x)
a_atan = arctan_prime(x); n_atan = centered(np.arctan, x)
print(f'x={x:+.1f}: arcsin\' analytic={a_asin:.6f}, numerical={n_asin:.6f} '
f'| arctan\' analytic={a_atan:.6f}, numerical={n_atan:.6f}')
print('\nAll PASS:', all(
abs(arcsin_prime(x) - centered(np.arcsin, x)) < 1e-5 and
abs(arctan_prime(x) - centered(np.arctan, x)) < 1e-5
for x in test_pts
))
# IFT: verify (f^-1)'(f(x)) = 1/f'(x) for f = exp
print('\nIFT check: (ln)\' at y = e^x = f(x), should equal 1/f\'(x) = 1/e^x')
for x in [0.5, 1.0, 2.0]:
y = np.exp(x)
inv_prime_at_y = 1/y # (ln)' at y
ift_formula = 1/np.exp(x) # 1/f'(x)
print(f' x={x}: (ln)\'(e^x)={inv_prime_at_y:.6f}, 1/e^x={ift_formula:.6f}')
18. Mean Value Theorem — Visualization
We visualize the MVT: for $f$ continuous on $[a, b]$ and differentiable on $(a, b)$, there exists $c \in (a, b)$ where the tangent is parallel to the secant. Applications: monotonicity tests, Lipschitz bounds.
Code cell 42
# === 18. MVT Visualization ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def f(x): return np.sin(x) + 0.3*x
def fp(x): return np.cos(x) + 0.3
a, b = 0.5, 4.0
secant_slope = (f(b) - f(a)) / (b - a)
# Find c where f'(c) = secant_slope numerically
x_scan = np.linspace(a, b, 10000)
fp_vals = fp(x_scan)
# c where fp closest to secant_slope
c_idx = np.argmin(abs(fp_vals - secant_slope))
c_mvt = x_scan[c_idx]
print(f'f(x) = sin(x) + 0.3x on [{a}, {b}]')
print(f'Secant slope: {secant_slope:.4f}')
print(f'MVT point c = {c_mvt:.4f}, f\'(c) = {fp(c_mvt):.4f}')
if HAS_MPL:
x_plot = np.linspace(0, 4.5, 300)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_plot, f(x_plot), 'b-', lw=2, label='f(x)')
# Secant line
x_sec = np.linspace(a-0.2, b+0.2, 50)
y_sec = f(a) + secant_slope*(x_sec - a)
ax.plot(x_sec, y_sec, 'r-', lw=2, label=f'Secant (slope={secant_slope:.2f})')
# Tangent at c
x_tang = np.linspace(c_mvt-0.8, c_mvt+0.8, 50)
y_tang = f(c_mvt) + fp(c_mvt)*(x_tang - c_mvt)
ax.plot(x_tang, y_tang, 'g--', lw=2, label=f'Tangent at c={c_mvt:.2f}')
ax.plot([a, b], [f(a), f(b)], 'ro', ms=8)
ax.plot(c_mvt, f(c_mvt), 'g^', ms=10, label='MVT point c')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Mean Value Theorem')
ax.legend()
plt.tight_layout(); plt.show()
19. Derivatives and Taylor Approximation
The first-order Taylor approximation $f(x) \approx f(a) + f'(a)(x - a)$ is the derivative's most direct practical application. We verify approximation quality across multiple functions.
Code cell 44
# === 19. Linear (First-Order Taylor) Approximation ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except:
HAS_MPL = False
def linear_approx(f, fp, a, x):
return f(a) + fp(a)*(x - a)
functions = [
('sin(x)', np.sin, np.cos, np.pi/4),
('e^x', np.exp, np.exp, 0.0),
('ln(x)', np.log, lambda x: 1/x, 1.0),
('sqrt(x)', np.sqrt, lambda x: 1/(2*np.sqrt(x)), 4.0),
]
print('=== Linear Approximation Quality ===')
print(f'{"Function":>12} {"a":>6} {"x-a":>6} {"True f(x)":>12} {"Linear":>12} {"Error":>10}')
for name, f, fp, a in functions:
for dx in [0.1, 0.5, 1.0]:
x = a + dx
true_val = f(x)
lin_val = linear_approx(f, fp, a, x)
error = abs(true_val - lin_val)
print(f'{name:>12} {a:6.2f} {dx:6.2f} {true_val:12.6f} {lin_val:12.6f} {error:10.2e}')
print('\nPattern: error grows as O((x-a)^2) — the quadratic (second-order) term.')
print('The full story is in §04-Series-and-Sequences (Taylor series).')
20. Subgradients and Non-Smooth Functions
For non-differentiable convex functions (ReLU, |x|, L1 norm), subgradients generalize the derivative. Subgradient descent is used for L1 regularization.
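Formally, $g$ is a subgradient of a convex $f$ at $x_0$ when its supporting line stays below the graph everywhere:

$$
f(x) \ge f(x_0) + g\,(x - x_0) \quad \forall x,
\qquad\text{e.g.}\qquad
\partial |x| \big|_{x=0} = [-1, 1].
$$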
Code cell 46
# === 20. Subgradients ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
# Subgradient of |x| at x=0 is any value in [-1,1]
# For optimization: use sign(x), subgradient 0 at x=0 is standard choice
x = np.linspace(-2, 2, 400)
f_abs = np.abs(x)
f_relu = np.maximum(0, x)
# Supporting hyperplanes at x=0 for |x|
x_supp = np.linspace(-1.5, 1.5, 50)
subgrads_to_show = [-1.0, -0.5, 0.0, 0.5, 1.0]
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
ax = axes[0]
ax.plot(x, f_abs, 'b-', lw=3, label='|x|', zorder=3)
colors = plt.cm.plasma(np.linspace(0.1, 0.9, len(subgrads_to_show)))
for g, c in zip(subgrads_to_show, colors):
y_supp = g * x_supp # supporting line: f(0) + g*(x-0) = g*x
ax.plot(x_supp, y_supp, '--', color=c, alpha=0.7, lw=1.5,
label=f'g={g:+.1f}')
ax.set_xlim(-1.5, 1.5); ax.set_ylim(-0.2, 1.5)
ax.set_xlabel('x'); ax.set_title('Subgradients of |x| at x=0')
ax.legend(fontsize=9)
ax2 = axes[1]
ax2.plot(x, f_relu, 'r-', lw=3, label='ReLU(x)')
for g, c in zip([0.0, 0.5, 1.0], ['blue','green','orange']):
y_supp = g * x_supp
ax2.plot(x_supp, y_supp, '--', color=c, alpha=0.7, label=f'g={g:.1f}')
ax2.set_xlim(-1.5, 1.5); ax2.set_ylim(-0.3, 1.5)
ax2.set_xlabel('x'); ax2.set_title('Subgradients of ReLU at x=0')
ax2.legend()
plt.tight_layout(); plt.show()
# Soft-thresholding (proximal operator of lambda*|x|)
lam = 0.3
def soft_threshold(v, lam):
return np.sign(v) * np.maximum(np.abs(v) - lam, 0)
v_test = np.array([-1.0, -0.2, 0.0, 0.15, 0.5, 1.0])
print('=== Soft-Thresholding (Proximal Operator of L1) ===')
print(f'lambda = {lam}')
for v in v_test:
print(f' v={v:+.2f} -> ST={soft_threshold(v, lam):+.2f}')
21. Modern Activations — SiLU, Swish, Mish
Post-ReLU activations used in modern transformers. SiLU/Swish $= x\,\sigma(x)$; Mish $= x \tanh(\operatorname{softplus}(x))$.
Code cell 48
# === 21. Modern Activations ===
import numpy as np
try:
import matplotlib.pyplot as plt
from scipy.special import erf
HAS_MPL = True
except ImportError:
HAS_MPL = False
def sigmoid(x): return 1/(1+np.exp(-np.clip(x,-500,500)))
# SiLU / Swish
def swish(x): return x * sigmoid(x)
def swish_prime(x):
s = sigmoid(x)
return s + x * s * (1 - s)
# Mish
def mish(x): return x * np.tanh(np.log(1+np.exp(np.clip(x,-500,500))))
def mish_prime(x):
sp = np.log(1+np.exp(np.clip(x,-500,500))) # softplus(x)
ts = np.tanh(sp)
sig = sigmoid(x)
return ts + x * (1 - ts**2) * sig
# GELU for comparison
def gelu(x): return x * 0.5 * (1 + erf(x/np.sqrt(2)))
def gelu_prime(x):
Phi = 0.5 * (1 + erf(x/np.sqrt(2)))
phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
return Phi + x*phi
# Verify derivatives
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)
x = 1.5
print('=== Modern Activation Derivative Verification at x=1.5 ===')
for name, fn, fn_prime in [('Swish', swish, swish_prime), ('Mish', mish, mish_prime),
('GELU', gelu, gelu_prime)]:
a = fn_prime(x); n = centered(fn, x)
print(f" {'PASS' if abs(a-n)<1e-5 else 'FAIL'} {name}: analytic={a:.6f}, numerical={n:.6f}")
x_plot = np.linspace(-3, 3, 400)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for fn, nm, c in [(swish,'Swish','b'), (mish,'Mish','r'), (gelu,'GELU','g'),
(lambda x: np.maximum(0,x), 'ReLU', 'k')]:
axes[0].plot(x_plot, fn(x_plot), lw=2, color=c, label=nm)
axes[0].set_ylim(-0.5, 3); axes[0].set_title('Modern Activations')
axes[0].set_xlabel('x'); axes[0].legend()
for fn, nm, c in [(swish_prime,'Swish\'','b'), (mish_prime,'Mish\'','r'),
(gelu_prime,'GELU\'','g')]:
axes[1].plot(x_plot, fn(x_plot), lw=2, color=c, label=nm)
axes[1].set_ylim(-0.2, 1.2); axes[1].set_title('Activation Derivatives')
axes[1].set_xlabel('x'); axes[1].legend()
plt.tight_layout(); plt.show()
22. Gradient Descent — Convergence Analysis
For $L$-smooth functions, a learning rate $\eta < 2/L$ guarantees descent. We verify the descent lemma numerically.
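On the quadratic used below the analysis is exact: gradient descent is a linear map,

$$
\theta_{t+1} = \theta_t - \eta L \theta_t = (1 - \eta L)\,\theta_t
\;\Longrightarrow\;
\theta_t = (1 - \eta L)^t\, \theta_0,
$$

which converges iff $|1 - \eta L| < 1$, i.e. $0 < \eta < 2/L$.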
Code cell 50
# === 22. Gradient Descent Convergence ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except:
HAS_MPL = False
# Quadratic: f(theta) = (L/2) * theta^2, L-smooth with constant L = 5
L_smooth = 5.0
def f_quad(theta): return (L_smooth/2) * theta**2
def grad_quad(theta): return L_smooth * theta
theta0 = 3.0
steps = 30
print(f'=== Gradient Descent on f(theta) = ({L_smooth}/2)*theta^2 ===')
print(f'L-smooth constant L = {L_smooth}')
print(f'Critical eta: 2/L = {2/L_smooth:.4f}')
learning_rates = [0.1, 0.3, 1/L_smooth, 0.35, 0.4]
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 6))
for eta in learning_rates:
theta = theta0
losses = [f_quad(theta)]
for _ in range(steps):
theta -= eta * grad_quad(theta)
losses.append(f_quad(theta))
converges = losses[-1] < 1e-6
print(f' eta={eta:.2f}: final loss={losses[-1]:.2e} ({"converges" if converges else "DIVERGES"})')
if HAS_MPL:
ax.semilogy(losses, lw=2, label=f'eta={eta:.2f}')
if HAS_MPL:
ax.axhline(1e-6, color='gray', ls=':', label='Convergence threshold')
ax.set_xlabel('Step'); ax.set_ylabel('Loss (log scale)')
ax.set_title(f'GD on Quadratic (L={L_smooth}) — Effect of Learning Rate')
ax.legend()
plt.tight_layout(); plt.show()
print(f'\nConclusion: eta >= 2/L = {2/L_smooth:.2f} fails to converge (oscillates at the threshold, diverges beyond); eta < 2/L converges.')
23. Score Function and Fisher Information
The derivative of the log-likelihood with respect to the parameter is the score. Its variance is the Fisher information, which connects calculus to statistical estimation and second-order optimization.
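The definitions used in the cell below:

$$
s(\theta; x) = \frac{\partial}{\partial \theta} \log p(x; \theta),
\qquad
I(\theta) = \mathbb{E}\bigl[s(\theta; X)^2\bigr] = -\,\mathbb{E}\!\left[\frac{\partial^2}{\partial \theta^2} \log p(X; \theta)\right].
$$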
Code cell 52
# === 23. Score Function and Fisher Information ===
import numpy as np
# Gaussian family: p(x; mu) = (1/sqrt(2pi)) * exp(-(x-mu)^2/2)
# log p = const - (x-mu)^2/2
# score = d/dmu log p = x - mu
# Fisher info: E[(score)^2] = E[(x-mu)^2] = Var[X] = 1 (for standard normal)
np.random.seed(42)
mu_true = 2.0
n_samples = 10000
X = np.random.normal(mu_true, 1.0, n_samples)
# Compute score and Fisher info empirically
def score(x, mu): return x - mu
def log_likelihood(X, mu): return -0.5*np.sum((X - mu)**2) - len(X)*0.5*np.log(2*np.pi)
# MLE: dL/dmu = 0 => sum(x_i - mu) = 0 => mu = xbar
mu_mle = np.mean(X)
# Score at MLE should be ~0 (optimality condition)
score_at_mle = np.mean(score(X, mu_mle))
# Fisher info: E[(x-mu)^2] should be Var[X] = 1
fisher_empirical = np.mean(score(X, mu_true)**2)
print('=== Score Function and Fisher Information ===')
print(f'True mu: {mu_true}, MLE mu_hat: {mu_mle:.4f}')
print(f'Score at MLE (should be ~0): {score_at_mle:.6f}')
print(f'Empirical Fisher info: {fisher_empirical:.4f} (true: 1.0)')
# Fisher-Rao connection: Fisher info = -E[d^2 log p/dmu^2]
# d^2/dmu^2 log p = -1 for Gaussian, so E[-(-1)] = 1
fisher_via_hessian = -np.mean(np.full_like(X, -1.0))  # d^2/dmu^2 log p = -1 for every sample
print(f'Fisher via Hessian: {fisher_via_hessian:.4f} (identity: I = -E[d^2 log p/dmu^2])')
# Cramer-Rao bound: Var[mu_hat] >= 1/I(mu)
print(f'\nCramer-Rao bound: Var[MLE] >= 1/I = {1/fisher_empirical:.4f}')
print(f'Actual Var[X_bar] = 1/n = {1/n_samples:.4f} (achieves bound — MLE is efficient)')
Summary
This notebook covered the full theory of differentiation:
| Section | Key Result |
|---|---|
| §1 Intuition | Secant → tangent line as $h \to 0$ |
| §2 Definition | $f'(x) = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$ |
| §3 Rules | Power, product, quotient, chain, elementary functions |
| §4 Chain Rule | $(f \circ g)'(x) = f'(g(x))\,g'(x)$ — backbone of backprop |
| §5 Implicit | $dy/dx = -F_x / F_y$; linear approximation $f(a) + f'(a)(x - a)$ |
| §6 Second Derivative | Concavity, inflection, $f''$ test at critical points |
| §7 Extrema / MVT | First/second derivative tests; Lipschitz bounds |
| §8 Activations | $\sigma' = \sigma(1 - \sigma)$, ReLU', GELU'; vanishing gradients |
| §9 Numerical Diff | Forward $O(h)$, centered $O(h^2)$, optimal $h \approx \varepsilon_{\text{mach}}^{1/3}$ |
| §10 Proofs | Product rule, chain rule, differentiable → continuous |
| §11 Adam | First/second moment estimates; adaptive learning rates |
| §12 Backprop | Full 2-layer network chain rule derivation and verification |
| §13 Softmax+CE | $\partial L / \partial z = p - y$ |
| §14 Vanishing Grad | Exponential decay through sigmoid layers; ReLU advantage |
| §15 Stability | Numerically stable sigmoid, log-sigmoid, softmax |
Next: Integration →