Theory Notebook: Math for LLMs

Converted from theory.ipynb for web reading.

Limits and Continuity - Examples

This notebook demonstrates limits and continuity concepts with practical examples and visualizations.

Topics Covered

  1. Limit Intuition (Numerical Approach)
  2. One-Sided Limits
  3. Fundamental Limits
  4. L'Hôpital's Rule
  5. Limits at Infinity
  6. Continuity Concepts
  7. Squeeze Theorem
  8. Softmax Temperature Limit (ML)
  9. Sigmoid Saturation (ML)
  10. Learning Rate Decay (ML)
  11. Numerical Stability
  12. ε-δ Definition (Visualization)
  13. Limit Laws
  14. Euler's Number as a Limit
  15. Intermediate Value Theorem and Bisection
  16. Asymptotic Growth Hierarchy
  17. Gradient as a Limit (Gradient Checking)

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok



def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def forward_diff(f, x, h=1e-6):
    return (f(x + h) - f(x)) / h

def backward_diff(f, x, h=1e-6):
    return (f(x) - f(x - h)) / h



def grad_check(f, x, analytic_grad, h=1e-6):
    x = np.asarray(x, dtype=float)
    analytic_grad = np.asarray(analytic_grad, dtype=float)
    numeric_grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        x_plus = x.copy(); x_minus = x.copy()
        x_plus[idx] += h; x_minus[idx] -= h
        numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

print("Chapter helper setup complete.")

1. Limit Intuition (Numerical Approach)

We approach limits numerically by evaluating the function at points closer and closer to the target value.

Code cell 5

print("Evaluate: lim(x→2) (x² - 4)/(x - 2)")
print("="*50)

def f(x):
    return (x**2 - 4) / (x - 2)

# Approach from left
print("\nApproaching from the LEFT (x < 2):")
left_vals = [1.9, 1.99, 1.999, 1.9999, 1.99999]
for x in left_vals:
    print(f"  f({x}) = {f(x):.8f}")

# Approach from right
print("\nApproaching from the RIGHT (x > 2):")
right_vals = [2.1, 2.01, 2.001, 2.0001, 2.00001]
for x in right_vals:
    print(f"  f({x}) = {f(x):.8f}")

print("\n" + "="*50)
print("Both sides approach 4!")
print("\nAlgebraic verification:")
print("(x² - 4)/(x - 2) = (x-2)(x+2)/(x-2) = x + 2")
print("lim(x→2) (x + 2) = 4 ✓")

Code cell 6

# Visualize the limit
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the function (avoiding x=2)
x_left = np.linspace(0, 1.99, 100)
x_right = np.linspace(2.01, 4, 100)

ax.plot(x_left, f(x_left), 'b-', linewidth=2, label=r'$f(x) = \frac{x^2-4}{x-2}$')
ax.plot(x_right, f(x_right), 'b-', linewidth=2)

# Mark the hole at x=2
ax.plot(2, 4, 'wo', markersize=10, markeredgecolor='blue', markeredgewidth=2)

# Add arrows showing approach
ax.annotate('', xy=(2, 4), xytext=(1.5, 3.5),
            arrowprops=dict(arrowstyle='->', color='red', lw=2))
ax.annotate('', xy=(2, 4), xytext=(2.5, 4.5),
            arrowprops=dict(arrowstyle='->', color='red', lw=2))

ax.axhline(y=4, color='gray', linestyle='--', alpha=0.5, label='Limit = 4')
ax.axvline(x=2, color='gray', linestyle='--', alpha=0.5)

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('f(x)', fontsize=12)
ax.set_title(r'Limit: $\lim_{x \to 2} \frac{x^2-4}{x-2} = 4$', fontsize=14)
ax.legend()
ax.set_xlim(0, 4)
ax.set_ylim(0, 6)
plt.tight_layout()
plt.show()

2. One-Sided Limits

Sometimes the left-hand and right-hand limits are different. When they disagree, the two-sided limit does not exist.

Code cell 8

print("Consider f(x) = |x|/x (the sign function)")
print("="*50)

def sign_func(x):
    return np.sign(x)

# From left (approaching 0 from negative side)
print("\nFrom the LEFT (x → 0⁻):")
for x in [-0.1, -0.01, -0.001, -0.0001]:
    print(f"  f({x}) = {sign_func(x)}")
print("  Left-hand limit = -1")

# From right (approaching 0 from positive side)
print("\nFrom the RIGHT (x → 0⁺):")
for x in [0.1, 0.01, 0.001, 0.0001]:
    print(f"  f({x}) = {sign_func(x)}")
print("  Right-hand limit = +1")

print("\n" + "="*50)
print("Since left limit ≠ right limit,")
print("lim(x→0) |x|/x does NOT exist!")

Code cell 9

# Visualize one-sided limits
fig, ax = plt.subplots(figsize=(10, 6))

x_neg = np.linspace(-2, -0.01, 100)
x_pos = np.linspace(0.01, 2, 100)

ax.plot(x_neg, sign_func(x_neg), 'b-', linewidth=2, label=r'$f(x) = |x|/x$')
ax.plot(x_pos, sign_func(x_pos), 'b-', linewidth=2)

# Mark the discontinuity
ax.plot(0, -1, 'wo', markersize=10, markeredgecolor='blue', markeredgewidth=2)
ax.plot(0, 1, 'wo', markersize=10, markeredgecolor='blue', markeredgewidth=2)

ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
ax.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

# Add annotations
ax.annotate('Right limit = +1', xy=(0.1, 1), xytext=(0.5, 1.3),
            fontsize=11, arrowprops=dict(arrowstyle='->', color='green'))
ax.annotate('Left limit = -1', xy=(-0.1, -1), xytext=(-0.5, -1.3),
            fontsize=11, arrowprops=dict(arrowstyle='->', color='red'))

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('f(x)', fontsize=12)
ax.set_title('One-Sided Limits: Sign Function', fontsize=14)
ax.legend(loc='upper right')
ax.set_xlim(-2, 2)
ax.set_ylim(-1.5, 1.5)
plt.tight_layout()
plt.show()

3. Fundamental Limits

These are essential limits that appear frequently in calculus and analysis.

Code cell 11

print("FUNDAMENTAL LIMITS")
print("="*60)

# Limit 1: sin(x)/x
print("\n1. lim(x→0) sin(x)/x = 1")
print("-" * 40)
for x in [0.1, 0.01, 0.001, 0.0001, 0.00001]:
    val = np.sin(x) / x
    print(f"   x = {x:.5f}: sin(x)/x = {val:.12f}")

# Limit 2: (e^x - 1)/x
print("\n2. lim(x→0) (eˣ - 1)/x = 1")
print("-" * 40)
for x in [0.1, 0.01, 0.001, 0.0001, 0.00001]:
    val = (np.exp(x) - 1) / x
    print(f"   x = {x:.5f}: (eˣ - 1)/x = {val:.12f}")

# Limit 3: (1 + 1/n)^n → e
print("\n3. lim(n→∞) (1 + 1/n)ⁿ = e")
print("-" * 40)
for n in [10, 100, 1000, 10000, 100000]:
    val = (1 + 1/n)**n
    error = abs(val - np.e)
    print(f"   n = {n:6d}: (1 + 1/n)ⁿ = {val:.12f}  (error = {error:.2e})")
print(f"   True e = {np.e:.12f}")

Code cell 12

# Visualize fundamental limits
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# sin(x)/x
x = np.linspace(-10, 10, 1000)
x = x[x != 0]  # Avoid division by zero
y = np.sin(x) / x
axes[0].plot(x, y, 'b-', linewidth=2)
axes[0].axhline(y=1, color='r', linestyle='--', label='y = 1')
axes[0].set_xlabel('x')
axes[0].set_ylabel('sin(x)/x')
axes[0].set_title(r'$\lim_{x \to 0} \frac{\sin x}{x} = 1$')
axes[0].legend()
axes[0].set_ylim(-0.5, 1.2)

# (e^x - 1)/x
x = np.linspace(-2, 2, 1000)
x = x[np.abs(x) > 0.001]
y = (np.exp(x) - 1) / x
axes[1].plot(x, y, 'b-', linewidth=2)
axes[1].axhline(y=1, color='r', linestyle='--', label='y = 1')
axes[1].set_xlabel('x')
axes[1].set_ylabel(r'$(e^x - 1)/x$')
axes[1].set_title(r'$\lim_{x \to 0} \frac{e^x - 1}{x} = 1$')
axes[1].legend()
axes[1].set_ylim(-0.5, 4)

# (1 + 1/n)^n
n = np.arange(1, 101)
y = (1 + 1/n)**n
axes[2].plot(n, y, 'b-', linewidth=2)
axes[2].axhline(y=np.e, color='r', linestyle='--', label=f'y = e ≈ {np.e:.4f}')
axes[2].set_xlabel('n')
axes[2].set_ylabel(r'$(1 + 1/n)^n$')
axes[2].set_title(r'$\lim_{n \to \infty} (1 + 1/n)^n = e$')
axes[2].legend()

plt.tight_layout()
plt.show()

4. L'Hôpital's Rule

For indeterminate forms $\frac{0}{0}$ or $\frac{\infty}{\infty}$:

$$\lim_{x \to a} \frac{f(x)}{g(x)} = \lim_{x \to a} \frac{f'(x)}{g'(x)}$$

provided the limit on the right exists.

Code cell 14

print("L'HÔPITAL'S RULE EXAMPLE")
print("="*60)
print("\nFind: lim(x→0) (eˣ - 1 - x)/x²")

print("\n--- Step 1: Check indeterminate form ---")
print("At x = 0: numerator = e⁰ - 1 - 0 = 0")
print("At x = 0: denominator = 0² = 0")
print("Form: 0/0 → Apply L'Hôpital")

print("\n--- Step 2: First application ---")
print("f(x) = eˣ - 1 - x  →  f'(x) = eˣ - 1")
print("g(x) = x²          →  g'(x) = 2x")
print("\nlim(x→0) (eˣ - 1)/(2x) → still 0/0!")

print("\n--- Step 3: Second application ---")
print("f'(x) = eˣ - 1  →  f''(x) = eˣ")
print("g'(x) = 2x      →  g''(x) = 2")
print("\nlim(x→0) eˣ/2 = e⁰/2 = 1/2")

print("\n--- Numerical Verification ---")
def f(x):
    return (np.exp(x) - 1 - x) / x**2

for x in [0.1, 0.01, 0.001, 0.0001, 0.00001]:
    print(f"   f({x}) = {f(x):.12f}")
print("\n   Approaches 0.5 ✓")
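The same computation can be verified symbolically; a minimal sketch, assuming SymPy is available:

```python
import sympy as sp

x = sp.symbols('x')

# Direct symbolic limit of the 0/0 form
print(sp.limit((sp.exp(x) - 1 - x) / x**2, x, 0))  # 1/2

# Mirror the two manual L'Hopital steps: differentiate top and bottom twice
num, den = sp.exp(x) - 1 - x, x**2
for _ in range(2):
    num, den = sp.diff(num, x), sp.diff(den, x)
print(num / den, '->', sp.limit(num / den, x, 0))  # exp(x)/2 -> 1/2
```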

5. Limits at Infinity

For polynomial ratios, the behavior depends on the degrees of the polynomials.

Code cell 16

print("LIMITS AT INFINITY")
print("="*60)

# Case 1: Equal degrees
print("\n1. lim(x→∞) (2x² + 3x)/(x² - 1)")
print("   Equal degrees → ratio of leading coefficients")
print("   = 2/1 = 2")

def f1(x):
    return (2*x**2 + 3*x) / (x**2 - 1)

print("\n   Numerical verification:")
for x in [10, 100, 1000, 10000, 100000]:
    print(f"     f({x:6d}) = {f1(x):.10f}")

# Case 2: Numerator degree < Denominator degree
print("\n2. lim(x→∞) x/(x² + 1)")
print("   Numerator degree < Denominator degree → limit is 0")

def f2(x):
    return x / (x**2 + 1)

print("\n   Numerical verification:")
for x in [10, 100, 1000, 10000, 100000]:
    print(f"     f({x:6d}) = {f2(x):.10f}")

# Case 3: Numerator degree > Denominator degree
print("\n3. lim(x→∞) x³/(x² + 1)")
print("   Numerator degree > Denominator degree → limit is ±∞")

def f3(x):
    return x**3 / (x**2 + 1)

print("\n   Numerical verification:")
for x in [10, 100, 1000, 10000]:
    print(f"     f({x:5d}) = {f3(x):.4f}")

6. Continuity Concepts

A function is continuous at $a$ if:

  1. $f(a)$ is defined
  2. $\lim_{x \to a} f(x)$ exists
  3. $\lim_{x \to a} f(x) = f(a)$
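These three conditions translate directly into a numerical test. Below is a minimal sketch (`is_continuous_at` is a hypothetical helper; the step size `h` and tolerance are arbitrary choices, so a finite `h` can only approximate the limit):

```python
import numpy as np

def is_continuous_at(f, a, h=1e-6, tol=1e-4):
    """Numerically test the three continuity conditions at x = a (approximate)."""
    # 1. f(a) is defined and finite
    try:
        fa = f(a)
    except ZeroDivisionError:
        return False
    if not np.isfinite(fa):
        return False
    # 2. the one-sided limits (sampled at a - h and a + h) agree
    left, right = f(a - h), f(a + h)
    if not np.isclose(left, right, atol=tol):
        return False
    # 3. the limit equals f(a)
    return bool(np.isclose(fa, (left + right) / 2, atol=tol))

print(is_continuous_at(np.sin, 0.0))                           # True
print(is_continuous_at(lambda x: np.sign(x), 0.0))             # False (jump)
print(is_continuous_at(lambda x: (x**2 - 4) / (x + 2), -2.0))  # False (hole: 0/0)
```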

Code cell 18

print("TYPES OF DISCONTINUITIES")
print("="*60)

print("\n1. REMOVABLE DISCONTINUITY (hole)")
print("   f(x) = (x² - 4)/(x + 2) at x = -2")
print("   f(-2) is undefined (0/0)")
print("   But: f(x) = (x-2)(x+2)/(x+2) = x - 2")
print("   lim(x→-2) = -4 exists")
print("   Can be 'fixed' by defining f(-2) = -4")

print("\n2. JUMP DISCONTINUITY")
print("   f(x) = |x|/x at x = 0")
print("   Left limit = -1, Right limit = +1")
print("   Limits exist but are different")

print("\n3. INFINITE DISCONTINUITY (vertical asymptote)")
print("   f(x) = 1/(x² - 1) at x = ±1")
print("   lim(x→1⁻) = -∞, lim(x→1⁺) = +∞")
print("   Limit does not exist (infinite)")

Code cell 19

# Visualize types of discontinuities
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Removable discontinuity
x1 = np.linspace(-5, 5, 1000)
x1 = x1[np.abs(x1 + 2) > 0.05]
y1 = (x1**2 - 4) / (x1 + 2)
axes[0].plot(x1, y1, 'b-', linewidth=2)
axes[0].plot(-2, -4, 'wo', markersize=10, markeredgecolor='blue', markeredgewidth=2)
axes[0].set_xlabel('x')
axes[0].set_ylabel('f(x)')
axes[0].set_title('Removable (hole at x = -2)')
axes[0].set_xlim(-5, 3)
axes[0].set_ylim(-6, 2)

# Jump discontinuity
x2_neg = np.linspace(-2, -0.01, 100)
x2_pos = np.linspace(0.01, 2, 100)
axes[1].plot(x2_neg, np.sign(x2_neg), 'b-', linewidth=2)
axes[1].plot(x2_pos, np.sign(x2_pos), 'b-', linewidth=2)
axes[1].plot(0, -1, 'bo', markersize=8, fillstyle='none', markeredgewidth=2)
axes[1].plot(0, 1, 'bo', markersize=8, fillstyle='none', markeredgewidth=2)
axes[1].set_xlabel('x')
axes[1].set_ylabel('f(x)')
axes[1].set_title('Jump (at x = 0)')

# Infinite discontinuity
x3 = np.linspace(-3, 3, 1000)
mask = (np.abs(x3 - 1) > 0.1) & (np.abs(x3 + 1) > 0.1)
x3_clean = x3[mask]
y3 = 1 / (x3_clean**2 - 1)
axes[2].plot(x3_clean, y3, 'b-', linewidth=2)
axes[2].axvline(x=-1, color='gray', linestyle='--', alpha=0.5)
axes[2].axvline(x=1, color='gray', linestyle='--', alpha=0.5)
axes[2].set_xlabel('x')
axes[2].set_ylabel('f(x)')
axes[2].set_title('Infinite (asymptotes at x = ±1)')
axes[2].set_ylim(-5, 5)

plt.tight_layout()
plt.show()

7. Squeeze Theorem

If $g(x) \leq f(x) \leq h(x)$ near $a$ and $\lim_{x \to a} g(x) = \lim_{x \to a} h(x) = L$, then $\lim_{x \to a} f(x) = L$.

Code cell 21

print("SQUEEZE THEOREM EXAMPLE")
print("="*60)
print("\nFind: lim(x→0) x² sin(1/x)")

print("\n--- Key insight ---")
print("-1 ≤ sin(1/x) ≤ 1 for all x ≠ 0")
print("\nMultiply by x² (positive for x ≠ 0):")
print("-x² ≤ x² sin(1/x) ≤ x²")

print("\n--- Apply squeeze ---")
print("lim(x→0) (-x²) = 0")
print("lim(x→0) (x²) = 0")
print("\nBy Squeeze Theorem: lim(x→0) x² sin(1/x) = 0")

print("\n--- Numerical verification ---")
def f(x):
    return x**2 * np.sin(1/x)

for x in [0.1, 0.01, 0.001, 0.0001]:
    val = f(x)
    bound = x**2
    print(f"  x = {x}: f(x) = {val:12.6e}, bounds = ±{bound:.6e}")

Code cell 22

# Visualize squeeze theorem
fig, ax = plt.subplots(figsize=(12, 6))

x = np.linspace(0.01, 1, 1000)
y = x**2 * np.sin(1/x)
upper = x**2
lower = -x**2

ax.fill_between(x, lower, upper, alpha=0.3, color='green', label='Squeeze bounds')
ax.plot(x, y, 'b-', linewidth=2, label=r'$x^2 \sin(1/x)$')
ax.plot(x, upper, 'r--', linewidth=1.5, label=r'$x^2$')
ax.plot(x, lower, 'r--', linewidth=1.5, label=r'$-x^2$')

ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
ax.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('f(x)', fontsize=12)
ax.set_title(r'Squeeze Theorem: $\lim_{x \to 0} x^2 \sin(1/x) = 0$', fontsize=14)
ax.legend(loc='upper right')
ax.set_xlim(0, 1)
ax.set_ylim(-0.3, 0.3)
plt.tight_layout()
plt.show()

8. Softmax Temperature Limit (ML Application)

The softmax function with temperature parameter $T$:

$$\text{softmax}(z_i/T) = \frac{e^{z_i/T}}{\sum_j e^{z_j/T}}$$

  • As $T \to 0$: approaches the "hard" max (all probability on the largest logit)
  • As $T \to \infty$: approaches the uniform distribution

Code cell 24

print("SOFTMAX TEMPERATURE LIMIT")
print("="*60)

def softmax(z, T=1.0):
    """Compute softmax with temperature."""
    z_scaled = z / T
    # Subtract max for numerical stability
    exp_z = np.exp(z_scaled - np.max(z_scaled))
    return exp_z / np.sum(exp_z)

z = np.array([1.0, 2.0, 3.0])
print(f"\nLogits z = {z}")

print("\nSoftmax at different temperatures:")
print("-" * 50)

temperatures = [100.0, 10.0, 1.0, 0.5, 0.1, 0.01]
for T in temperatures:
    probs = softmax(z, T)
    print(f"  T = {T:6.2f}: {np.round(probs, 5)}")

print("\n" + "="*60)
print("Observations:")
print("  T → ∞: Softmax → uniform [0.333, 0.333, 0.333]")
print("  T → 0: Softmax → hard max [0, 0, 1]")

Code cell 25

# Visualize softmax temperature effect
fig, ax = plt.subplots(figsize=(12, 6))

z = np.array([1.0, 2.0, 3.0])
temps = np.logspace(-2, 2, 100)

probs = np.array([softmax(z, T) for T in temps])

for i, label in enumerate(['z₁ = 1', 'z₂ = 2', 'z₃ = 3']):
    ax.semilogx(temps, probs[:, i], linewidth=2, label=label)

ax.axhline(y=1/3, color='gray', linestyle='--', alpha=0.5, label='Uniform')
ax.axhline(y=1, color='gray', linestyle=':', alpha=0.5)
ax.axhline(y=0, color='gray', linestyle=':', alpha=0.5)

ax.set_xlabel('Temperature T', fontsize=12)
ax.set_ylabel('Probability', fontsize=12)
ax.set_title('Softmax Temperature Effect', fontsize=14)
ax.legend()
ax.set_ylim(-0.05, 1.05)

# Add annotations
ax.annotate('T→0: Hard max', xy=(0.01, 0.95), fontsize=10)
ax.annotate('T→∞: Uniform', xy=(50, 0.38), fontsize=10)

plt.tight_layout()
plt.show()

9. Sigmoid Saturation (ML Application)

The sigmoid function:

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

Limits:

  • $\lim_{x \to +\infty} \sigma(x) = 1$
  • $\lim_{x \to -\infty} \sigma(x) = 0$

Code cell 27

print("SIGMOID SATURATION")
print("="*60)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

print("\nσ(x) = 1/(1 + e^(-x))")

print("\nAs x → +∞:")
for x in [1, 5, 10, 20, 50]:
    print(f"  σ({x:2d}) = {sigmoid(x):.12f}  σ'({x:2d}) = {sigmoid_derivative(x):.2e}")
print("  → 1")

print("\nAs x → -∞:")
for x in [-1, -5, -10, -20, -50]:
    print(f"  σ({x:3d}) = {sigmoid(x):.12e}  σ'({x:3d}) = {sigmoid_derivative(x):.2e}")
print("  → 0")

print("\n" + "="*60)
print("KEY INSIGHT for ML:")
print("At saturation: σ'(x) ≈ 0 → Vanishing gradient problem!")
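The effect compounds with depth: by the chain rule, backpropagating through a stack of sigmoids multiplies the per-layer factors σ'(z) = σ(z)(1 − σ(z)) ≤ 0.25. A toy sketch (depth and input value are arbitrary choices):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Chain rule through 10 stacked sigmoids: the total gradient is a product
# of per-layer factors sigma'(z) = s(1 - s), each at most 0.25.
z, grad = 2.0, 1.0
for layer in range(10):
    s = sigmoid(z)
    grad *= s * (1 - s)   # multiply in this layer's local derivative
    z = s                 # activation feeds the next layer
print(f"gradient after 10 sigmoid layers: {grad:.2e}")  # bounded by 0.25**10 ~ 9.5e-7
```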

Code cell 28

# Visualize sigmoid and its derivative
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

x = np.linspace(-8, 8, 1000)
y = sigmoid(x)
dy = sigmoid_derivative(x)

# Sigmoid
axes[0].plot(x, y, 'b-', linewidth=2, label='σ(x)')
axes[0].axhline(y=1, color='r', linestyle='--', alpha=0.5, label='y = 1 (limit)')
axes[0].axhline(y=0, color='r', linestyle='--', alpha=0.5, label='y = 0 (limit)')
axes[0].axhline(y=0.5, color='gray', linestyle=':', alpha=0.5)
axes[0].fill_between(x[x > 5], 0, 1, alpha=0.2, color='red', label='Saturation zone')
axes[0].fill_between(x[x < -5], 0, 1, alpha=0.2, color='red')
axes[0].set_xlabel('x', fontsize=12)
axes[0].set_ylabel('σ(x)', fontsize=12)
axes[0].set_title('Sigmoid Function', fontsize=14)
axes[0].legend()

# Derivative
axes[1].plot(x, dy, 'g-', linewidth=2, label="σ'(x) = σ(x)(1-σ(x))")
axes[1].fill_between(x[x > 5], 0, dy[x > 5], alpha=0.3, color='red', label='Vanishing gradient')
axes[1].fill_between(x[x < -5], 0, dy[x < -5], alpha=0.3, color='red')
axes[1].axhline(y=0, color='gray', linestyle='-', alpha=0.3)
axes[1].set_xlabel('x', fontsize=12)
axes[1].set_ylabel("σ'(x)", fontsize=12)
axes[1].set_title('Sigmoid Derivative (Vanishing at Saturation)', fontsize=14)
axes[1].legend()

plt.tight_layout()
plt.show()

10. Learning Rate Decay (ML Application)

For convergence of SGD, we need:

  1. $\sum_t \alpha_t = \infty$ (can reach any point)
  2. $\sum_t \alpha_t^2 < \infty$ (variance goes to zero)

Code cell 30

print("LEARNING RATE DECAY CONVERGENCE")
print("="*60)

print("\nFor SGD convergence, we need:")
print("  1. Σ αₜ = ∞      (can explore the whole space)")
print("  2. Σ αₜ² < ∞     (noise vanishes in the limit)")

print("\n--- Example: αₜ = 1/t ---")

# Check condition 1: harmonic series diverges
print("\nCondition 1: Σ(1/t)")
for n in [10, 100, 1000, 10000, 100000]:
    harmonic = sum(1/t for t in range(1, n+1))
    print(f"  Σ(t=1 to {n:6d}) 1/t = {harmonic:.4f}")
print("  → ∞ (harmonic series diverges) ✓")

# Check condition 2: sum of squares converges
print("\nCondition 2: Σ(1/t²)")
for n in [10, 100, 1000, 10000, 100000]:
    sum_sq = sum(1/t**2 for t in range(1, n+1))
    print(f"  Σ(t=1 to {n:6d}) 1/t² = {sum_sq:.8f}")
print(f"  → π²/6 ≈ {np.pi**2/6:.8f} (converges) ✓")
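To see the two conditions at work, here is a toy SGD run on f(w) = w²/2 with artificially noisy gradients (noise level, seed, and step count are arbitrary choices): the αₜ = 1/t schedule satisfies both conditions, while a constant rate violates the second and keeps a noise floor.

```python
import numpy as np

rng = np.random.default_rng(0)

def run_sgd(lr_schedule, steps=20000, w0=5.0, noise=1.0):
    """Minimize f(w) = w^2/2 using noisy gradients g = w + noise."""
    w = w0
    for t in range(1, steps + 1):
        g = w + noise * rng.standard_normal()   # stochastic gradient
        w -= lr_schedule(t) * g
    return w

w_decay = run_sgd(lambda t: 1.0 / t)   # satisfies both conditions
w_const = run_sgd(lambda t: 0.1)       # violates sum(alpha_t^2) < inf
print(f"alpha_t = 1/t : final w = {w_decay:+.5f}")   # settles near the optimum w* = 0
print(f"alpha_t = 0.1 : final w = {w_const:+.5f}")   # keeps jittering around w* = 0
```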

Code cell 31

# Visualize learning rate decay
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

t = np.arange(1, 101)

# Learning rate decay
alpha = 1 / t
axes[0].plot(t, alpha, 'b-', linewidth=2)
axes[0].set_xlabel('Iteration t', fontsize=12)
axes[0].set_ylabel('αₜ = 1/t', fontsize=12)
axes[0].set_title('Learning Rate Decay', fontsize=14)

# Cumulative sum (diverges)
cumsum = np.cumsum(alpha)
axes[1].plot(t, cumsum, 'g-', linewidth=2)
axes[1].set_xlabel('n', fontsize=12)
axes[1].set_ylabel('Σαₜ', fontsize=12)
axes[1].set_title('Σ(1/t) → ∞ (diverges)', fontsize=14)

# Cumulative sum of squares (converges)
cumsum_sq = np.cumsum(alpha**2)
axes[2].plot(t, cumsum_sq, 'r-', linewidth=2, label='Σ(1/t²)')
axes[2].axhline(y=np.pi**2/6, color='gray', linestyle='--', label=f'π²/6 ≈ {np.pi**2/6:.4f}')
axes[2].set_xlabel('n', fontsize=12)
axes[2].set_ylabel('Σαₜ²', fontsize=12)
axes[2].set_title('Σ(1/t²) → π²/6 (converges)', fontsize=14)
axes[2].legend()

plt.tight_layout()
plt.show()

11. Numerical Stability

When computing limits numerically, we must be careful about catastrophic cancellation.

Code cell 33

print("NUMERICAL STABILITY")
print("="*60)
print("\nComputing (eˣ - 1)/x as x → 0")
print("The limit is 1, but naive computation fails!")

print("\n--- Naive vs Stable Computation ---")
print(f"{'x':>12} {'Naive':>16} {'Stable':>16} {'Error (naive)':>16}")
print("-" * 64)

for x in [1e-5, 1e-8, 1e-10, 1e-12, 1e-15, 1e-16]:
    naive = (np.exp(x) - 1) / x
    stable = np.expm1(x) / x if x != 0 else 1.0
    error = abs(naive - 1)
    print(f"{x:12.0e} {naive:16.12f} {stable:16.12f} {error:16.2e}")

print("\n" + "="*60)
print("Explanation:")
print("  For x = 1e-16: e^x ≈ 1.0000000000000001")
print("  Subtracting 1 causes catastrophic cancellation!")
print("\nSolution: Use np.expm1(x) which computes e^x - 1 accurately for small x")

Code cell 34

# Another numerical stability example: log1p
print("\nAnother Example: log(1 + x) for small x")
print("="*60)

print(f"{'x':>12} {'Naive log(1+x)':>18} {'Stable log1p(x)':>18}")
print("-" * 52)

for x in [1e-5, 1e-10, 1e-15, 1e-16, 1e-17]:
    naive = np.log(1 + x)
    stable = np.log1p(x)
    print(f"{x:12.0e} {naive:18.15f} {stable:18.15f}")

print("\nFor very small x, log(1+x) ≈ x (from Taylor series)")
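That Taylor expansion doubles as a cross-check on log1p. A small sketch comparing against the two-term expansion (the gap should shrink like x³/3):

```python
import numpy as np

# log(1 + x) = x - x^2/2 + x^3/3 - ...  for |x| < 1
for x in [1e-2, 1e-4, 1e-6]:
    exact = np.log1p(x)
    taylor2 = x - x**2 / 2   # two-term Taylor approximation
    print(f"x={x:.0e}: log1p={exact:.16e}  |log1p - taylor2|={abs(exact - taylor2):.1e}")
```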

12. ε-δ Definition: Interactive Visualization

The formal definition: $\lim_{x \to a} f(x) = L$ means that for every $\varepsilon > 0$ there exists $\delta > 0$ such that $0 < |x - a| < \delta \implies |f(x) - L| < \varepsilon$.

Code cell 36

# === 12.1 epsilon-delta Visualization ===

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733', 'tertiary': '#009988',
          'error': '#CC3311', 'neutral': '#555555', 'highlight': '#EE3377'}

# f(x) = (x^2-4)/(x-2), lim_{x->2} = 4
def f(x):
    return np.where(np.abs(x - 2) > 1e-10, (x**2 - 4)/(x - 2), np.nan)

a, L = 2.0, 4.0
eps = 0.5
delta = eps / 2  # away from x=2, f(x) = x + 2, so |f(x)-L| = |x-2|; any delta <= eps works, eps/2 leaves margin

fig, ax = plt.subplots(figsize=(10, 7))
x = np.linspace(0.5, 3.5, 1000)
y = f(x)

ax.plot(x, y, color=COLORS['primary'], lw=2.5, label=r'$f(x)=(x^2-4)/(x-2)$')
ax.plot(a, L, 'o', ms=10, color='white', markeredgecolor=COLORS['primary'],
        markeredgewidth=2.5, zorder=5, label=f'Hole at x={a}')

# epsilon band
ax.axhspan(L - eps, L + eps, alpha=0.15, color=COLORS['secondary'],
           label=f'ε-band: ({L-eps:.1f}, {L+eps:.1f})')
ax.axhline(L - eps, color=COLORS['secondary'], ls='--', lw=1.2)
ax.axhline(L + eps, color=COLORS['secondary'], ls='--', lw=1.2)
ax.axhline(L, color=COLORS['neutral'], ls=':', lw=1)

# delta window
ax.axvspan(a - delta, a + delta, alpha=0.12, color=COLORS['tertiary'],
           label=f'δ-window: ({a-delta:.2f}, {a+delta:.2f})')
ax.axvline(a - delta, color=COLORS['tertiary'], ls='--', lw=1.2)
ax.axvline(a + delta, color=COLORS['tertiary'], ls='--', lw=1.2)

# annotations
ax.annotate(f'ε = {eps}', xy=(3.2, L+eps), fontsize=11, color=COLORS['secondary'])
ax.annotate(f'δ = {delta}', xy=(a+delta+0.02, 1.0), fontsize=11, color=COLORS['tertiary'])
ax.annotate(f'L = {L}', xy=(0.6, L+0.1), fontsize=11, color=COLORS['neutral'])

ax.set_xlabel('x', fontsize=13)
ax.set_ylabel('f(x)', fontsize=13)
ax.set_title(r'ε-δ Definition: $\lim_{x\to 2}\frac{x^2-4}{x-2} = 4$', fontsize=14)
ax.legend(fontsize=10)
ax.set_xlim(0.5, 3.5)
ax.set_ylim(1.0, 7.0)
fig.tight_layout()
plt.show()

# Verify: for x in (a-delta, a+delta), f(x) in (L-eps, L+eps)
x_test = np.linspace(a - delta + 0.001, a + delta - 0.001, 1000)
x_test = x_test[np.abs(x_test - a) > 1e-10]  # exclude a itself
f_vals = f(x_test)
f_vals = f_vals[~np.isnan(f_vals)]
all_in_band = np.all((f_vals > L - eps) & (f_vals < L + eps))
print(f'For δ={delta}: all f(x) in ε-band? {all_in_band}')
print(f'f(x) range in δ-window: [{f_vals.min():.4f}, {f_vals.max():.4f}]')
print(f'ε-band: ({L-eps:.4f}, {L+eps:.4f})')
print(f'PASS: ε-δ verified for ε={eps}, δ={delta}' if all_in_band else 'FAIL')
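The verification can be turned around: for a given ε, search for a δ that works. A brute-force sketch (`find_delta` is a hypothetical helper; the grid resolution is an arbitrary choice):

```python
import numpy as np

def find_delta(f, a, L, eps, deltas=None):
    """Grid search: return the largest sampled delta whose window stays in the eps-band."""
    if deltas is None:
        deltas = np.logspace(-6, 0, 200)[::-1]        # try large deltas first
    for delta in deltas:
        xs = np.linspace(a - delta, a + delta, 2000)  # even count: x = a is never sampled
        if np.all(np.abs(f(xs) - L) < eps):
            return float(delta)
    return None

f = lambda x: (x**2 - 4) / (x - 2)
for eps in [1.0, 0.5, 0.1, 0.01]:
    d = find_delta(f, 2.0, 4.0, eps)
    print(f"eps = {eps:5.2f} -> delta = {d:.5f}")
```

Since f reduces to x + 2 away from the hole, the returned δ tracks ε itself, matching the analysis above.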

13. Limit Laws: Numerical Verification

If $\lim_{x\to a} f(x) = L$ and $\lim_{x\to a} g(x) = M$, then:

  • $\lim(f+g) = L + M$, $\lim(fg) = LM$, and $\lim(f/g) = L/M$ (if $M \neq 0$)

Code cell 38

# === 13.1 Limit Laws Verification ===

import numpy as np

# f(x) = x^2 + 1, g(x) = sin(x)/x  near x=0
# lim f(x) = 1, lim g(x) = 1
h_vals = [1e-1, 1e-2, 1e-3, 1e-4, 1e-6, 1e-8]

print('Verifying limit laws at x -> 0')
print('f(x) = x^2 + 1  =>  lim = 1')
print('g(x) = sin(x)/x =>  lim = 1')
print()
print(f'{"h":>10} | {"f(h)":>12} | {"g(h)":>12} | {"f+g":>12} | {"f*g":>12}')
print('-' * 65)

for h in h_vals:
    fh = h**2 + 1
    gh = np.sin(h) / h
    print(f'{h:>10.2e} | {fh:>12.8f} | {gh:>12.8f} | {fh+gh:>12.8f} | {fh*gh:>12.8f}')

print()
print('Predicted by limit laws:')
print(f'  lim(f+g) = 1 + 1 = 2  (observed ~{1 + np.sin(1e-8)/1e-8:.6f} for h=1e-8)')
print(f'  lim(f*g) = 1 * 1 = 1  (observed ~{(1e-16+1)*(np.sin(1e-8)/1e-8):.6f} for h=1e-8)')

# Composition law: lim g(f(x)) = g(lim f(x)) when g is continuous
print()
print('Composition law: lim_{x->0} exp(sin(x)/x - 1) = exp(1-1) = exp(0) = 1')
for h in [1e-2, 1e-4, 1e-6]:
    val = np.exp(np.sin(h)/h - 1)
    print(f'  h={h:.0e}: exp(sin(h)/h - 1) = {val:.10f}')

14. Euler's Number: $(1 + 1/n)^n \to e$

The number $e = \lim_{n\to\infty}(1 + 1/n)^n \approx 2.71828$ is both the base of the natural exponential and the limit of discrete compounding.

Code cell 40

# === 14.1 Euler's Number as a Limit ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733', 'highlight': '#EE3377'}

ns = np.array([1, 2, 5, 10, 50, 100, 500, 1000, 5000, 10000, 100000, 1000000])
a_n = (1 + 1/ns)**ns
e_true = np.e
errors = np.abs(a_n - e_true)

print('Convergence of (1 + 1/n)^n to e:')
print(f'{"n":>10} | {"(1+1/n)^n":>15} | {"error":>12}')
print('-' * 45)
for n, val, err in zip(ns, a_n, errors):
    print(f'{n:>10} | {val:>15.10f} | {err:>12.2e}')
print(f'True e =   {e_true:.10f}')

# Convergence rate: error ~ 1/(2n) for large n
print()
print('Convergence rate ~ e/(2n):')
for n, err in zip(ns[-5:], errors[-5:]):
    predicted = e_true / (2 * n)
    print(f'  n={n:.0e}: error={err:.2e}, predicted e/(2n)={predicted:.2e}, ratio={err/predicted:.2f}')

# Visualization
ns_fine = np.logspace(0, 6, 300)
a_fine = (1 + 1/ns_fine)**ns_fine

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].semilogx(ns_fine, a_fine, color=COLORS['primary'], lw=2, label=r'$(1+1/n)^n$')
axes[0].axhline(e_true, color=COLORS['highlight'], ls='--', lw=1.5, label=f'e = {e_true:.5f}')
axes[0].set_xlabel('n (log scale)', fontsize=12)
axes[0].set_ylabel(r'$(1+1/n)^n$', fontsize=12)
axes[0].set_title(r'$\lim_{n\to\infty}(1+1/n)^n = e$', fontsize=13)
axes[0].legend()

axes[1].loglog(ns_fine, np.abs((1+1/ns_fine)**ns_fine - e_true),
               color=COLORS['secondary'], lw=2, label='|error|')
axes[1].loglog(ns_fine, e_true/(2*ns_fine), color=COLORS['primary'],
               ls='--', lw=1.5, label='e/(2n) (predicted rate)')
axes[1].set_xlabel('n', fontsize=12)
axes[1].set_ylabel('|error|', fontsize=12)
axes[1].set_title('Convergence rate: O(1/n)', fontsize=13)
axes[1].legend()

fig.tight_layout()
plt.show()
print('PASS: sequence converges to e with rate O(1/n)')

15. Intermediate Value Theorem: Bisection Root Finding

If $f$ is continuous on $[a, b]$ and $f(a)f(b) < 0$, then $f$ has a root in $(a, b)$. Bisection exploits the IVT to find it.

Code cell 42

# === 15.1 IVT and Bisection Method ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311'}

def bisection(f, a, b, tol=1e-10, max_iter=100):
    """Bisection method: IVT guarantees root in (a,b) if f(a)*f(b) < 0."""
    assert f(a) * f(b) < 0, 'f(a) and f(b) must have opposite signs'
    history = []
    for i in range(max_iter):
        m = (a + b) / 2
        history.append({'iter': i, 'a': a, 'b': b, 'm': m, 'f(m)': f(m), 'width': b-a})
        if abs(f(m)) < tol or (b - a) / 2 < tol:
            break
        if f(a) * f(m) < 0:
            b = m
        else:
            a = m
    return m, history

# Find root of f(x) = x^3 - x - 2 (root near x=1.5214)
f = lambda x: x**3 - x - 2
a0, b0 = 1.0, 2.0

print(f'f({a0}) = {f(a0):.4f}, f({b0}) = {f(b0):.4f}')
print(f'Signs differ: {f(a0)*f(b0) < 0} => IVT guarantees a root in ({a0},{b0})')
print()

root, history = bisection(f, a0, b0)

print(f'{"Iter":>5} | {"a":>10} | {"b":>10} | {"m":>12} | {"f(m)":>12} | {"width":>12}')
print('-' * 70)
for h in history[:10]:
    print(f'{h["iter"]:>5} | {h["a"]:>10.6f} | {h["b"]:>10.6f} | {h["m"]:>12.8f} | {h["f(m)"]:>12.2e} | {h["width"]:>12.2e}')
print(f'...')
print(f'Root found: {root:.12f}')
print(f'f(root) = {f(root):.2e}')
print(f'Iterations: {len(history)}')
print(f'Theoretical: log2((b-a)/tol) = {np.log2((b0-a0)/1e-10):.1f} iterations')

# Visualization
x_plot = np.linspace(0.5, 2.5, 300)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(x_plot, f(x_plot), color=COLORS['primary'], lw=2.5)
axes[0].axhline(0, color='gray', lw=1)  # COLORS has no 'neutral' entry here
axes[0].plot(root, 0, 's', ms=10, color=COLORS['error'], zorder=5, label=f'Root ≈ {root:.4f}')
axes[0].set_xlabel('x', fontsize=12); axes[0].set_ylabel('f(x)', fontsize=12)
axes[0].set_title(r'$f(x) = x^3 - x - 2$', fontsize=13)
axes[0].legend()

iters = [h['iter'] for h in history]
widths = [h['width'] for h in history]
axes[1].semilogy(iters, widths, color=COLORS['secondary'], lw=2, label='Interval width')
axes[1].semilogy(iters, [(b0-a0)*0.5**i for i in iters], color=COLORS['primary'],
                 ls='--', lw=1.5, label=r'$(b-a)/2^n$ (theory)')
axes[1].set_xlabel('Iteration', fontsize=12)
axes[1].set_ylabel('Interval width (log)', fontsize=12)
axes[1].set_title('Bisection Convergence: O(1/2^n)', fontsize=13)
axes[1].legend()

fig.tight_layout(); plt.show()
assert abs(f(root)) < 1e-9, 'Root not accurate'
print('PASS: bisection converged to root, IVT verified')

16. Asymptotic Growth Hierarchy

As $x \to \infty$: $\ln x \ll x^p \ll e^x \ll x^x$. Every polynomial is dominated by every exponential; every logarithm by every power.

Code cell 44

# === 16.1 Asymptotic Growth Comparison ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311', 'highlight': '#EE3377'}

# Show lim_{x->inf} x^n / e^x = 0 for n=1,2,3
print('lim_{x->inf} x^n / e^x = 0 (polynomial dominated by exponential)')
x_vals = np.array([1, 5, 10, 20, 50, 100])
for n in [1, 2, 3]:
    ratios = x_vals**n / np.exp(x_vals)
    print(f'  n={n}: ratios = {[f"{r:.2e}" for r in ratios]}')

print()
print('lim_{x->inf} ln(x) / x^p = 0 for p>0 (log dominated by any power)')
x_vals2 = np.array([10, 100, 1000, 10000])
for p in [0.1, 0.5, 1.0]:
    ratios = np.log(x_vals2) / x_vals2**p
    print(f'  p={p}: ratios = {[f"{r:.4f}" for r in ratios]}')

# Visualization
x = np.linspace(1, 10, 500)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Panel 1: functions themselves
axes[0].plot(x, np.log(x), color=COLORS['tertiary'], lw=2, label=r'$\ln x$')
axes[0].plot(x, x, color=COLORS['primary'], lw=2, label=r'$x$')
axes[0].plot(x, x**2, color=COLORS['secondary'], lw=2, label=r'$x^2$')
axes[0].plot(x, np.exp(x), color=COLORS['error'], lw=2, label=r'$e^x$')
axes[0].set_ylim(0, 150)
axes[0].set_xlabel('x', fontsize=12); axes[0].set_ylabel('f(x)', fontsize=12)
axes[0].set_title('Growth Hierarchy: $\\ln x \\ll x \\ll x^2 \\ll e^x$', fontsize=13)
axes[0].legend(fontsize=11)

# Panel 2: ratios to show dominance
x2 = np.linspace(1, 20, 500)
axes[1].semilogy(x2, x2 / np.exp(x2), color=COLORS['primary'], lw=2, label=r'$x/e^x \to 0$')
axes[1].semilogy(x2, x2**2 / np.exp(x2), color=COLORS['secondary'], lw=2, label=r'$x^2/e^x \to 0$')
axes[1].semilogy(x2, np.log(x2) / x2, color=COLORS['tertiary'], lw=2, label=r'$\ln x/x \to 0$')
axes[1].set_xlabel('x', fontsize=12)
axes[1].set_ylabel('Ratio (log scale)', fontsize=12)
axes[1].set_title('All ratios $\\to 0$: dominance hierarchy', fontsize=13)
axes[1].legend(fontsize=11)

fig.tight_layout(); plt.show()
print('PASS: asymptotic hierarchy verified numerically')
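The same three dominance relations can also be cross-checked symbolically. A minimal sketch, assuming `sympy` is available (it is not imported elsewhere in this notebook):

```python
import sympy as sp

x = sp.symbols('x', positive=True)

# Each ratio pits a slower-growing function against a faster one; all limits are 0.
limits = {
    'ln(x) / x^(1/2)': sp.limit(sp.log(x) / sp.sqrt(x), x, sp.oo),
    'x^3 / e^x':       sp.limit(x**3 / sp.exp(x), x, sp.oo),
    'e^x / x^x':       sp.limit(sp.exp(x) / x**x, x, sp.oo),
}
for name, val in limits.items():
    print(f'{name} -> {val}')  # all print 0
```

This confirms symbolically what the numerical tables above show: each rung of the hierarchy is strictly dominated by the next.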

17. Gradient as a Limit: Numerical Differentiation and Gradient Checking

The derivative $f'(a) = \lim_{h\to 0}[f(a+h)-f(a)]/h$ can be approximated numerically. Gradient checking verifies automatic differentiation implementations.

Code cell 46

# === 17.1 Finite Differences and Gradient Checking ===

import numpy as np

# Demonstrate one-sided vs centered finite differences
f = lambda x: x**3 + 2*x - 1  # f'(x) = 3x^2 + 2
f_prime = lambda x: 3*x**2 + 2
a = 2.0  # true derivative at a=2: 3*4+2 = 14

h_vals = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-10, 1e-12, 1e-14]
true_val = f_prime(a)

print(f'True f\'({a}) = {true_val}')
print()
print(f'{"h":>10} | {"one-sided err":>15} | {"centered err":>15}')
print('-' * 50)

one_sided_errs = []
centered_errs = []
for h in h_vals:
    one_sided = (f(a + h) - f(a)) / h
    centered = (f(a + h) - f(a - h)) / (2*h)
    err1 = abs(one_sided - true_val)
    errc = abs(centered - true_val)
    one_sided_errs.append(err1)
    centered_errs.append(errc)
    print(f'{h:>10.2e} | {err1:>15.2e} | {errc:>15.2e}')

optimal_h1 = h_vals[np.argmin(one_sided_errs)]
optimal_hc = h_vals[np.argmin(centered_errs)]
print(f'\nOptimal h (one-sided): {optimal_h1:.0e}')
print(f'Optimal h (centered):  {optimal_hc:.0e}')
print(f'Machine eps sqrt: {np.sqrt(np.finfo(float).eps):.2e} (expected for one-sided)')
print(f'Machine eps cube-root: {np.finfo(float).eps**(1/3):.2e} (expected for centered)')

# Gradient check for a neural network-like loss
print()
print('=== Gradient Check ===')
np.random.seed(42)
theta = np.array([1.0, -0.5, 2.0])  # parameters

def loss(t):  # toy loss: ||Wt||^2 with W fixed
    W = np.array([[1, 2, -1], [0, 1, 3]])
    z = W @ t
    return 0.5 * np.dot(z, z)

def grad_analytic(t):  # W^T W t
    W = np.array([[1, 2, -1], [0, 1, 3]], dtype=float)
    return W.T @ (W @ t)

h = 1e-5
grad_fd = np.zeros(3)
for i in range(3):
    tp = theta.copy(); tp[i] += h
    tm = theta.copy(); tm[i] -= h
    grad_fd[i] = (loss(tp) - loss(tm)) / (2*h)

grad_an = grad_analytic(theta)
rel_err = np.linalg.norm(grad_an - grad_fd) / (np.linalg.norm(grad_an) + np.linalg.norm(grad_fd))
print(f'Analytic gradient: {grad_an}')
print(f'FD gradient:       {grad_fd}')
print(f'Relative error:    {rel_err:.2e}')
print('PASS: gradient check passed' if rel_err < 1e-5 else 'FAIL')
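The error tables above show finite differences hitting a floor once rounding error dominates truncation error. For real-analytic functions that accept complex inputs, the complex-step derivative $f'(a) \approx \operatorname{Im} f(a + ih)/h$ avoids the subtraction entirely, so there is no cancellation and $h$ can be made tiny. A sketch on the same test function (this assumes `f` is written so complex arguments propagate through):

```python
import numpy as np

f = lambda x: x**3 + 2*x - 1   # same test function as above, f'(x) = 3x^2 + 2
a, true_val = 2.0, 14.0

# No subtraction of nearly equal quantities => no cancellation floor.
for h in [1e-8, 1e-20, 1e-100]:
    cs = np.imag(f(a + 1j*h)) / h
    print(f'h={h:.0e}: complex-step deriv = {cs:.15f}, error = {abs(cs - true_val):.2e}')
```

Unlike the centered difference, the error here does not blow up as $h \to 0$; it stays near machine precision even at $h = 10^{-100}$.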

18. ReLU and GELU: Continuity and Corner Behavior

ReLU is continuous but not differentiable at 0. GELU ($x\Phi(x)$, where $\Phi$ is the standard normal CDF) is $C^\infty$ everywhere.

Code cell 48

# === 18.1 Activation Function Continuity Analysis ===

import numpy as np
import matplotlib.pyplot as plt
from scipy.special import erf

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311'}

def relu(x): return np.maximum(0, x)
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

# Verify continuity at x=0: one-sided limits
print('=== Continuity at x=0 ===')
for h in [1e-1, 1e-4, 1e-8, 1e-12]:
    relu_left = relu(-h)
    relu_right = relu(h)
    gelu_left = gelu(-h)
    gelu_right = gelu(h)
    print(f'h={h:.0e}: ReLU({-h:.0e})={relu_left:.2e}, ReLU({h:.0e})={relu_right:.2e} | '
          f'GELU({-h:.0e})={gelu_left:.2e}, GELU({h:.0e})={gelu_right:.2e}')

print()
print('Both lim_{x->0^+} and lim_{x->0^-} equal 0 for ReLU and GELU => both continuous')

# Verify non-differentiability of ReLU at 0
print()
print('=== Differentiability at x=0 ===')
print(f'{"h":>10} | {"ReLU right deriv":>18} | {"ReLU left deriv":>18} | {"GELU deriv":>12}')
print('-' * 68)
for h in [1e-1, 1e-3, 1e-6, 1e-10]:
    relu_rd = (relu(h) - relu(0)) / h
    relu_ld = (relu(-h) - relu(0)) / (-h)
    gelu_cd = (gelu(h) - gelu(-h)) / (2*h)
    print(f'{h:>10.0e} | {relu_rd:>18.6f} | {relu_ld:>18.6f} | {gelu_cd:>12.6f}')

print()
print('ReLU: right deriv -> 1, left deriv -> 0 => NOT differentiable at 0')
print('GELU: centered deriv -> 0.5 = Phi(0) + 0*phi(0) => differentiable (C^inf)')

# Visualization
x = np.linspace(-3, 3, 1000)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Functions
axes[0].plot(x, relu(x), color=COLORS['primary'], lw=2.5, label='ReLU')
axes[0].plot(x, gelu(x), color=COLORS['secondary'], lw=2.5, label='GELU')
axes[0].plot(x, sigmoid(x), color=COLORS['tertiary'], lw=2, ls='--', label='Sigmoid')
axes[0].axvline(0, color='gray', lw=0.8, ls=':')
axes[0].set_xlabel('x', fontsize=12); axes[0].set_ylabel('f(x)', fontsize=12)
axes[0].set_title('Activation Functions', fontsize=13)
axes[0].legend(); axes[0].set_ylim(-1, 3)

# Derivatives (numerical)
h = 1e-5
relu_d = (relu(x + h) - relu(x - h)) / (2*h)
gelu_d = (gelu(x + h) - gelu(x - h)) / (2*h)
sigmoid_d = sigmoid(x) * (1 - sigmoid(x))

axes[1].plot(x, relu_d, color=COLORS['primary'], lw=2.5, label="ReLU'")
axes[1].plot(x, gelu_d, color=COLORS['secondary'], lw=2.5, label="GELU'")
axes[1].plot(x, sigmoid_d, color=COLORS['tertiary'], lw=2, ls='--', label="Sigmoid'")
axes[1].axvline(0, color='gray', lw=0.8, ls=':')
axes[1].set_xlabel('x', fontsize=12); axes[1].set_ylabel("f'(x)", fontsize=12)
axes[1].set_title('Derivatives: ReLU jump vs GELU smooth', fontsize=13)
axes[1].legend()

fig.tight_layout(); plt.show()
print('PASS: continuity and derivative behavior verified')
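Because the exact GELU requires the normal CDF, many implementations use the tanh approximation from Hendrycks & Gimpel (2016), which is also $C^\infty$. A sketch comparing the two (the tolerance in the check is illustrative, not a library guarantee):

```python
import numpy as np
from scipy.special import erf

def gelu_exact(x):
    # x * Phi(x) with Phi expressed via erf
    return x * 0.5 * (1 + erf(x / np.sqrt(2)))

def gelu_tanh(x):
    # tanh approximation used in e.g. BERT and GPT-2
    return 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * (x + 0.044715 * x**3)))

x = np.linspace(-5, 5, 1001)
max_err = np.max(np.abs(gelu_exact(x) - gelu_tanh(x)))
print(f'max |exact - tanh approx| on [-5, 5]: {max_err:.2e}')
```

The two curves are visually indistinguishable at plot resolution, which is why the cheaper tanh form is a common drop-in replacement.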

19. Extreme Value Theorem and Uniform Continuity

EVT: a continuous $f$ on $[a,b]$ attains its maximum and minimum. Uniform continuity: $\delta$ depends only on $\varepsilon$, not on the point.

Code cell 50

# === 19.1 EVT and Uniform Continuity Demonstration ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'highlight': '#EE3377'}

# EVT: f(x) = sin(2x) + 0.5*cos(5x) on [0, 2*pi]
f = lambda x: np.sin(2*x) + 0.5*np.cos(5*x)
a, b = 0, 2*np.pi
x_dense = np.linspace(a, b, 10000)
y_dense = f(x_dense)

x_max = x_dense[np.argmax(y_dense)]
x_min = x_dense[np.argmin(y_dense)]
y_max = f(x_max)
y_min = f(x_min)

print('=== Extreme Value Theorem ===')
print(f'f(x) = sin(2x) + 0.5*cos(5x) on [0, 2π]')
print(f'Maximum: f({x_max:.4f}) = {y_max:.6f}')
print(f'Minimum: f({x_min:.4f}) = {y_min:.6f}')
print('Both are attained (not just approached) — EVT guarantee.')

# Uniform continuity: f(x) = sin(x) (uniformly continuous on R)
# vs f(x) = x^2 (NOT uniformly continuous on R)
print()
print('=== Uniform Continuity ===')
eps = 0.1
print(f'eps = {eps}. For sin(x): one delta works everywhere.')
# sin: |sin(x) - sin(y)| <= |x-y| (Lipschitz with L=1)
# So delta = eps works everywhere
delta_sin = eps
print(f'  sin(x): delta = {delta_sin} works uniformly (Lipschitz constant = 1)')

# x^2: need delta = eps/(2|a|+1) -- depends on a
print(f'  x^2 on R: delta depends on location:')
for a_pt in [1, 10, 100, 1000]:
    delta_sq = eps / (2*a_pt + 1)  # rough bound
    print(f'    near a={a_pt}: delta ~ {delta_sq:.4f} (shrinks to 0 as a->inf)')
print('  => x^2 is NOT uniformly continuous on R (delta -> 0 as a -> inf)')

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(x_dense, y_dense, color=COLORS['primary'], lw=2)
axes[0].plot(x_max, y_max, 'v', ms=12, color=COLORS['highlight'], zorder=5, label=f'Max = {y_max:.3f}')
axes[0].plot(x_min, y_min, '^', ms=12, color=COLORS['secondary'], zorder=5, label=f'Min = {y_min:.3f}')
axes[0].axhline(y_max, color=COLORS['highlight'], ls='--', lw=1)
axes[0].axhline(y_min, color=COLORS['secondary'], ls='--', lw=1)
axes[0].set_xlabel('x', fontsize=12); axes[0].set_ylabel('f(x)', fontsize=12)
axes[0].set_title('EVT: max and min attained on $[0, 2\\pi]$', fontsize=13)
axes[0].legend()

x2 = np.linspace(-5, 5, 400)
axes[1].plot(x2, np.sin(x2), color=COLORS['primary'], lw=2.5, label=r'$\sin x$ (uniform)')
axes[1].plot(x2, x2**2 / 10, color=COLORS['secondary'], lw=2.5, label=r'$x^2/10$ (not uniform on $\mathbb{R}$)')
axes[1].set_xlabel('x', fontsize=12)
axes[1].set_title('Uniform vs. pointwise continuity', fontsize=13)
axes[1].legend()

fig.tight_layout(); plt.show()
print('PASS: EVT and uniform continuity demonstrated')
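The shrinking-$\delta$ argument above can also be run in reverse: fix one $\delta$ and watch how far each function can move across an interval of that width. A minimal numerical sketch:

```python
import numpy as np

delta = 0.01
print(f'Fixed delta = {delta}: |f(x + delta) - f(x)|')
for x0 in [1.0, 10.0, 100.0, 1000.0]:
    jump_sq = abs((x0 + delta)**2 - x0**2)            # = 2*x0*delta + delta^2, unbounded in x0
    jump_sin = abs(np.sin(x0 + delta) - np.sin(x0))   # <= delta everywhere (Lipschitz constant 1)
    print(f'  x = {x0:>7.1f}: x^2 jump = {jump_sq:.4f}, sin jump = {jump_sin:.6f}')
```

For $x^2$ the jump grows without bound as $x_0 \to \infty$, so no single $\delta$ can work for all points; for $\sin x$ the jump never exceeds $\delta$, matching the Lipschitz argument above.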

Summary

| Concept | Key Idea | ML Application |
|---|---|---|
| Limit Definition | $\lim_{x \to a} f(x) = L$ | Convergence analysis |
| One-Sided Limits | Left/right limits may differ | Step functions, ReLU |
| Fundamental Limits | $\sin(x)/x \to 1$, $(1+1/n)^n \to e$ | Gradient approximations |
| L'Hôpital's Rule | For 0/0 or ∞/∞ forms | Analyzing loss behavior |
| Continuity | No breaks, holes, jumps | Activation smoothness |
| Squeeze Theorem | Bound oscillating functions | Convergence proofs |
| Softmax Temperature | $T \to 0$: hard max | Knowledge distillation |
| Sigmoid Saturation | Limits at ±∞ | Vanishing gradients |
| Learning Rate Decay | $\sum \alpha_t = \infty$, $\sum \alpha_t^2 < \infty$ | SGD convergence |
| Numerical Stability | Avoid cancellation | Use expm1, log1p |
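The numerical-stability row can be demonstrated directly. For small arguments, the naive forms `exp(x) - 1` and `log(1 + x)` lose digits to cancellation, while `expm1` and `log1p` keep full precision; a short sketch:

```python
import numpy as np

x = 1e-12
naive = np.exp(x) - 1      # subtracting nearly equal numbers leaves only a few correct digits
stable = np.expm1(x)       # evaluates e^x - 1 directly, full precision
print(f'exp(x) - 1 = {naive:.15e}')
print(f'expm1(x)   = {stable:.15e}')

y = 1e-15
print(f'log(1 + y) = {np.log(1 + y):.15e}')   # 1 + y already rounds away most of y
print(f'log1p(y)   = {np.log1p(y):.15e}')
```

For $x \to 0$, $e^x - 1 \approx x$ and $\ln(1+x) \approx x$, so the stable variants agree with the limit-based expansions to machine precision.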
