Theory Notebook: Math for LLMs

Fourier Series


Converted from theory.ipynb for web reading.

§20-01 Fourier Series — Theory Notebook

'Any periodic motion is a superposition of simple oscillations.' — Fourier

Interactive derivations for §20-01. Run top-to-bottom.

Contents

§1 Orthogonality of the trigonometric system
§2 Computing Fourier coefficients
§3 Square, sawtooth, triangle waveforms
§4 Convergence and Gibbs phenomenon
§5 Parseval's theorem
§6 Shift and differentiation properties
§7 RoPE positional encoding
§8 Spectral bias of neural networks
§9 Random Fourier Features

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import warnings; warnings.filterwarnings('ignore')

try:
    import matplotlib.pyplot as plt, matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind'); HAS_SNS=True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid'); HAS_SNS=False
    mpl.rcParams.update({
        'figure.figsize':(10,6),'figure.dpi':120,'font.size':13,
        'axes.titlesize':15,'axes.labelsize':13,'xtick.labelsize':11,
        'ytick.labelsize':11,'legend.fontsize':11,'lines.linewidth':2.0,
        'axes.spines.top':False,'axes.spines.right':False,
    })
    COLORS={'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988',
            'error':'#CC3311','neutral':'#555555','highlight':'#EE3377'}
    HAS_MPL=True
except ImportError:
    HAS_MPL=False

np.set_printoptions(precision=6,suppress=True)
np.random.seed(42)
print('Setup complete. HAS_MPL =', HAS_MPL)

1. Orthogonality of the Trigonometric System

The Fourier basis $\{e^{inx}\}_{n\in\mathbb{Z}}$ is orthonormal under the inner product $\langle f,g\rangle = \frac{1}{2\pi}\int_{-\pi}^{\pi} f\,\bar{g}\,dx$:

$$\langle e^{inx}, e^{imx}\rangle = \delta_{nm}$$

This makes the Fourier coefficients coordinates in this basis: $c_n = \langle f, e^{inx}\rangle$.
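As a quick sketch of the coordinate claim (the grid size and the test coefficients below are arbitrary choices): build $f$ from a handful of basis vectors and check that $\langle f, e^{inx}\rangle$ returns exactly the weights it was built from.

```python
import numpy as np

# Build f as a known linear combination of basis vectors, then read its
# coordinates c_n = <f, e^{inx}> back off and compare with the construction.
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi / N_pts

true_coeffs = {-2: 0.5-0.25j, 0: 1.0, 3: 2j}
f = sum(c*np.exp(1j*n*x) for n, c in true_coeffs.items())

for n, c_true in true_coeffs.items():
    # <f, e^{inx}> with the (1/2pi)-normalised inner product
    c_num = np.sum(f*np.conj(np.exp(1j*n*x))) * dx / (2*np.pi)
    assert np.isclose(c_num, c_true, atol=1e-10)
print('Coordinates match the construction:', true_coeffs)
```

On a uniform grid the rectangle rule is exact (to rounding) for these exponentials, which is why such a tight tolerance works.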

Code cell 5

# === 1.1 Verify orthonormality numerically ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi / N_pts

def inner_prod(f, g):
    return np.sum(f * np.conj(g)) * dx / (2*np.pi)

tests = [(1,1,1.0), (1,2,0.0), (3,-3,0.0), (0,0,1.0)]
print('Orthonormality checks:')
for n, m, expected in tests:
    val = inner_prod(np.exp(1j*n*x), np.exp(1j*m*x))
    status = 'PASS' if np.isclose(abs(val), abs(expected), atol=1e-3) else 'FAIL'
    print(f'  {status}  <e^{{i{n}x}}, e^{{i{m}x}}> = {val.real:.5f}+{val.imag:.5f}j  (expected {expected})')

# Gram matrix for n = -2..2
ns = range(-2,3)
G = np.array([[inner_prod(np.exp(1j*n*x), np.exp(1j*m*x)) for m in ns] for n in ns])
print(f'\nGram matrix (abs values, should be identity):')
print(np.abs(G).round(4))
print('PASS - Gram matrix is identity' if np.allclose(G, np.eye(5), atol=1e-3) else 'FAIL')

2. Computing Fourier Coefficients

$$c_n = \frac{1}{2\pi}\int_{-\pi}^{\pi} f(x)\,e^{-inx}\,dx$$

We verify numerical computations against analytic formulas for three standard waveforms.

Code cell 7

# === 2.1 Fourier coefficient computation ===

def c_n(f_vals, n):
    """Numerical Fourier coefficient c_n via rectangle rule."""
    return np.sum(f_vals * np.exp(-1j*n*x)) * dx / (2*np.pi)

# Square wave: c_n = 2/(i*pi*n) for odd n, 0 for even n
f_sq = np.sign(np.sin(x))
print('Square wave c_n (complex form):')
for n in [1,2,3,4,5]:
    num = c_n(f_sq, n)
    analytic = 2/(1j*np.pi*n) if n%2==1 else 0.0
    ok = np.isclose(num, analytic, atol=1e-3)
    print(f'  c_{n}: num={num:.4f}  analytic={analytic:.4f}  {"PASS" if ok else "FAIL"}')

# Triangle wave: c_n = -2/(pi*n^2) for odd n, c_0=pi/2
f_tri = np.abs(x)
print('\nTriangle wave c_n (real, cosine series):')
for n in [0,1,2,3]:
    num = c_n(f_tri, n)
    if n==0:  analytic = np.pi/2
    elif n%2==1: analytic = -2/(np.pi*n**2)
    else: analytic = 0.0
    ok = np.isclose(num.real, analytic, atol=1e-3)
    print(f'  c_{n}: num={num.real:.5f}  analytic={analytic:.5f}  {"PASS" if ok else "FAIL"}')

3. Partial Sums and Convergence

$$S_N f(x) = \sum_{n=-N}^{N} c_n e^{inx}$$

We visualize convergence for all three waveforms and measure the $L^2$ error rate.

Code cell 9

# === 3.1 Partial sum functions ===

def sq_sum(x, N):
    return 4/np.pi * sum(np.sin((2*k+1)*x)/(2*k+1) for k in range(N+1))

def tri_sum(x, N):
    s = np.full_like(x, np.pi/2)
    s -= 4/np.pi * sum(np.cos((2*k+1)*x)/(2*k+1)**2 for k in range(N+1))
    return s

def saw_sum(x, N):
    return 2*sum((-1)**(n+1)*np.sin(n*x)/n for n in range(1,N+1))

x_p = np.linspace(-np.pi, np.pi, 2000)

# L2 error vs N
print('L2 error vs N:')
print(f'  {"N":>4}  {"Square":>10}  {"Triangle":>10}  {"Sawtooth":>10}')
for N in [1,3,5,10,20,50,100]:
    e_sq = np.sqrt(np.mean((np.sign(np.sin(x_p)) - sq_sum(x_p,N))**2))
    e_tr = np.sqrt(np.mean((np.abs(x_p) - tri_sum(x_p,N))**2))
    e_sw = np.sqrt(np.mean((x_p - saw_sum(x_p,N))**2))
    print(f'  {N:>4}  {e_sq:>10.5f}  {e_tr:>10.5f}  {e_sw:>10.5f}')

Code cell 10

# === 3.2 Visualise partial sums ===
if HAS_MPL:
    fig, axes = plt.subplots(1,3, figsize=(15,5))
    fig.suptitle('Fourier series convergence', fontsize=15)
    specs = [
        ('Square wave', np.sign(np.sin(x_p)), sq_sum),
        ('Triangle $|x|$', np.abs(x_p),       tri_sum),
        ('Sawtooth $x$',   x_p,                saw_sum),
    ]
    for ax, (name, f_true, fn) in zip(axes, specs):
        ax.plot(x_p, f_true, color=COLORS['neutral'], lw=1.5, label='True')
        for N, col in [(5,COLORS['secondary']),(20,COLORS['primary'])]:
            ax.plot(x_p, fn(x_p,N), color=col, lw=1.5, label=f'N={N}')
        ax.set_title(name); ax.set_xlabel('$x$'); ax.legend(fontsize=9)
    fig.tight_layout(); plt.show()
    print('Plot: convergence for three standard waveforms')

4. The Gibbs Phenomenon

Near a jump discontinuity, $S_N f$ overshoots by $\approx 9\%$ of the jump height, regardless of $N$. The overshoot, as a fraction of the jump, is $\frac{1}{\pi}\mathrm{Si}(\pi)-\frac{1}{2}\approx 0.0895$.
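The constant can be computed directly from the sine integral $\mathrm{Si}(\pi)$; a minimal sketch, assuming SciPy is available for `scipy.special.sici`:

```python
import numpy as np
from scipy.special import sici

# Gibbs constant: the overshoot, as a fraction of the jump, is (1/pi)*Si(pi) - 1/2
Si_pi, _ = sici(np.pi)                 # sici returns the pair (Si(x), Ci(x))
gibbs_const = Si_pi/np.pi - 0.5
print(f'(1/pi)*Si(pi) - 1/2 = {gibbs_const:.6f}')   # ~0.089490

# Cross-check against a high-order partial sum of the square wave (jump = 2)
N = 2000
xs = np.linspace(1e-4, 0.05, 20000)
S = 4/np.pi * sum(np.sin((2*k+1)*xs)/(2*k+1) for k in range(N+1))
print(f'numerical overshoot fraction at N={N}: {(S.max()-1)/2:.6f}')
```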

Code cell 12

# === 4.1 Quantify the Gibbs overshoot ===
x_zoom = np.linspace(-0.5, 0.5, 5000)
print('Gibbs overshoot vs N (jump height = 2, so 9% = 0.179):')
for N in [10,50,100,500,1000]:
    S = sq_sum(x_zoom, N)
    overshoot = np.max(S) - 1.0
    print(f'  N={N:5d}: max={np.max(S):.5f}  overshoot={overshoot:.5f}  ({100*overshoot/2:.2f}%)')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10,5))
    ax.plot(x_zoom, np.sign(np.sin(x_zoom)), color=COLORS['neutral'], lw=1.5, label='True')
    for N, col in [(5,COLORS['secondary']),(50,COLORS['primary'])]:
        ax.plot(x_zoom, sq_sum(x_zoom,N), color=col, lw=1.8, label=f'$S_{{{N}}}f$')
    ax.axhline(1.0+0.0895*2, color=COLORS['error'], ls='--', lw=1.2, label='9% overshoot')
    ax.set_title('Gibbs phenomenon: 9% overshoot persists as $N\\to\\infty$')
    ax.set_xlabel('$x$'); ax.set_ylabel('$S_Nf(x)$'); ax.legend()
    fig.tight_layout(); plt.show()

Code cell 13

# === 4.2 Cesaro means vs partial sums (Fejer's theorem) ===

def cesaro_sum(x, N, fn):
    """Cesaro mean sigma_N = (1/(N+1)) * sum_{k=0}^N S_k f"""
    return sum(fn(x,k) for k in range(N+1)) / (N+1)

x_zoom = np.linspace(-0.5, 0.5, 2000)
N = 50
S_N = sq_sum(x_zoom, N)
sigma_N = cesaro_sum(x_zoom, N, sq_sum)

print(f'Partial sum max: {np.max(S_N):.5f}  (Gibbs present)')
print(f'Cesaro mean max: {np.max(sigma_N):.5f}  (should be close to 1.0 — no Gibbs)')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10,5))
    ax.plot(x_zoom, np.sign(np.sin(x_zoom)), color=COLORS['neutral'], lw=1.5, label='True')
    ax.plot(x_zoom, S_N, color=COLORS['error'], lw=1.8, label=f'$S_{{{N}}}f$ (Gibbs)')
    ax.plot(x_zoom, sigma_N, color=COLORS['primary'], lw=1.8, label=f'$\\sigma_{{{N}}}f$ (Fejer)')
    ax.set_title("Fejer's theorem: Cesaro means remove Gibbs overshoot")
    ax.set_xlabel('$x$'); ax.legend(); fig.tight_layout(); plt.show()

5. Parseval's Theorem: Energy Conservation

$$\frac{1}{2\pi}\int_{-\pi}^{\pi}|f(x)|^2\,dx = \sum_{n=-\infty}^{\infty}|c_n|^2$$

The total energy (power) in the time domain equals the sum of the energies of the frequency components: the Fourier expansion is a unitary change of basis.

Application: setting $f(x)=x$ and applying Parseval yields the Basel sum $\sum_{n=1}^\infty 1/n^2 = \pi^2/6$.

Code cell 15

# === 5.1 Verify Parseval numerically ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts

def parseval_check(f_vals, N_terms=200):
    lhs = np.sum(np.abs(f_vals)**2)*dx / (2*np.pi)   # ||f||^2
    rhs = sum(abs(np.sum(f_vals*np.exp(-1j*n*x))*dx/(2*np.pi))**2
              for n in range(-N_terms, N_terms+1))
    return lhs, rhs

for name, fv in [('Square wave', np.sign(np.sin(x))),
                  ('Triangle',   np.abs(x)),
                  ('cos(x)',     np.cos(x))]:
    lhs, rhs = parseval_check(fv)
    ok = np.isclose(lhs, rhs, rtol=1e-2)
    print(f'  {"PASS" if ok else "FAIL"}  {name}: ||f||^2={lhs:.5f}  sum|c_n|^2={rhs:.5f}')

Code cell 16

# === 5.2 Basel problem via Parseval ===
# Apply Parseval to f(x) = x on (-pi, pi):
# ||f||^2 = (1/2pi)*integral x^2 dx = pi^2/3
# sum |c_n|^2 = sum_{n!=0} 1/n^2 = 2*sum_{n=1}^inf 1/n^2
# => sum 1/n^2 = pi^2/6

f_saw = x  # sawtooth on (-pi,pi)
lhs_exact = np.pi**2 / 3          # (1/2pi)*integral_{-pi}^{pi} x^2 dx
lhs_num = np.sum(f_saw**2)*dx/(2*np.pi)
print(f'||x||^2 exact = pi^2/3 = {lhs_exact:.6f}')
print(f'||x||^2 numerical     = {lhs_num:.6f}')

# Complex coefficients of f(x)=x: c_n = (-1)^{n+1}/(in) for n != 0, so |c_n|^2 = 1/n^2
# Parseval: pi^2/3 = sum_{n!=0} 1/n^2 = 2*sum_{n>=1} 1/n^2  =>  sum 1/n^2 = pi^2/6
N_sum = 10000
sum_inv_n2 = sum(1/n**2 for n in range(1, N_sum+1))
print(f'\nParseval => sum_{{n=1}}^{{inf}} 1/n^2 = {sum_inv_n2:.6f}')
print(f'pi^2/6 = {np.pi**2/6:.6f}')
print(f'PASS - Basel problem verified' if np.isclose(sum_inv_n2, np.pi**2/6, rtol=1e-3)
      else 'FAIL')

6. Properties: Shift and Differentiation

Shift theorem: $c_n[f(\cdot - x_0)] = e^{-inx_0}\,c_n[f]$
Differentiation: $c_n[f'] = in\,c_n[f]$

These two properties underpin the positional encodings used in transformers.

Code cell 18

# === 6.1 Shift theorem ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts

def c_n(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

f = np.cos(3*x) + 0.5*np.sin(x)   # test function
x0 = 0.7
f_shifted = np.cos(3*(x-x0)) + 0.5*np.sin(x-x0)

print('Shift theorem: c_n[f(x-x0)] = exp(-i*n*x0) * c_n[f]')
for n in [1, 2, 3]:
    lhs = c_n(f_shifted, n)
    rhs = np.exp(-1j*n*x0) * c_n(f, n)
    ok = np.isclose(lhs, rhs, atol=1e-4)
    print(f'  n={n}: PASS' if ok else f'  n={n}: FAIL  lhs={lhs:.5f} rhs={rhs:.5f}')

Code cell 19

# === 6.2 Differentiation theorem: c_n[f'] = i*n * c_n[f] ===
# f(x) = x^2 on (-pi,pi), f'(x) = 2x
# c_n[x^2] = 2*(-1)^n/n^2 and c_n[2x] = 2i*(-1)^n/n
# differentiation theorem: c_n[2x] = i*n * c_n[x^2] = i*n * 2*(-1)^n/n^2 = 2i*(-1)^n/n  (consistent)

f_x2 = x**2
f_2x = 2*x
print('Differentiation theorem: c_n[f\'] = i*n * c_n[f]')
for n in [1,2,3,4,5]:
    cn_f   = c_n(f_x2, n)
    cn_fp  = c_n(f_2x, n)
    predicted = 1j*n*cn_f
    ok = np.isclose(cn_fp, predicted, atol=1e-3)
    print(f'  n={n}: {"PASS" if ok else "FAIL"}  c_n[2x]={cn_fp:.5f}  i*n*c_n[x^2]={predicted:.5f}')

7. AI Application: Rotary Positional Encoding (RoPE)

RoPE (Su et al., 2021) encodes position $m$ by rotating each feature pair through the angle $m\theta_i$:

$$q'_{m,i} = (q_{2i}+iq_{2i+1})\,e^{im\theta_i}, \qquad \theta_i = 10000^{-2i/d}$$

The attention score $\langle q'_m, k'_n\rangle$ then depends on positions only through $m-n$ (relative position).

Code cell 21

# === 7.1 Implement RoPE and verify relative-position property ===

def rope_rotate(x_vec, pos, base=10000):
    """Apply RoPE rotation to a vector at given position."""
    d = len(x_vec)
    theta = np.array([base**(-2*i/d) for i in range(d//2)])
    angles = pos * theta
    cos_a, sin_a = np.cos(angles), np.sin(angles)
    x_even, x_odd = x_vec[0::2], x_vec[1::2]
    out = np.empty_like(x_vec)
    out[0::2] = x_even*cos_a - x_odd*sin_a
    out[1::2] = x_even*sin_a + x_odd*cos_a
    return out

np.random.seed(42)
d = 64
q = np.random.randn(d)
k = np.random.randn(d)

# Verify: score(m,n) == score(m+delta, n+delta)
print('RoPE relative-position property: score(m,n) = score(m+k, n+k)')
for m, n in [(0,3), (5,10), (100,107)]:
    delta = 50
    score1 = rope_rotate(q,m) @ rope_rotate(k,n)
    score2 = rope_rotate(q,m+delta) @ rope_rotate(k,n+delta)
    ok = np.isclose(score1, score2, atol=1e-8)
    print(f'  m={m:3d},n={n:3d}: score={score1:.6f}  (m+{delta},n+{delta})={score2:.6f}  {"PASS" if ok else "FAIL"}')

Code cell 22

# === 7.2 Visualise RoPE rotation as Fourier basis ===
if HAS_MPL:
    d = 8; base = 10000
    positions = np.arange(64)
    theta = np.array([base**(-2*i/d) for i in range(d//2)])

    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    fig.suptitle('RoPE: rotation angle per dimension pair vs. position', fontsize=13)
    for k_idx, ax in enumerate(axes):
        angles = positions * theta[k_idx]
        ax.plot(positions, np.cos(angles), color=COLORS['primary'],  label='cos')
        ax.plot(positions, np.sin(angles), color=COLORS['secondary'], label='sin')
        ax.set_title(f'Dim pair {k_idx} (freq={theta[k_idx]:.4f})')
        ax.set_xlabel('Position $m$')
        if k_idx==0: ax.legend(fontsize=9)
    fig.tight_layout(); plt.show()
    print('Each pair rotates at frequency theta_i = 10000^{-2i/d}.')
    print('Lower-indexed pairs complete full cycles faster => distinguish nearby tokens.')

8. Spectral Bias of Neural Networks

Neural networks learn low-frequency Fourier components of the target function first (Rahaman et al., 2019). We demonstrate this by training a small MLP on a high-frequency target and tracking Fourier coefficient convergence.

Code cell 24

# === 8.1 Spectral bias demonstration ===
# Target: f*(x) = sin(x) + 0.3*sin(5x) + 0.1*sin(11x)
# We'll fit with linear regression (proxy for MLP with linear activations)
# and observe which frequencies are captured first.

np.random.seed(42)
N_data = 200
x_data = np.linspace(-np.pi, np.pi, N_data)

def target(x):
    return np.sin(x) + 0.3*np.sin(5*x) + 0.1*np.sin(11*x)

y_data = target(x_data)

# Build Fourier feature matrix: columns = [sin(x), cos(x), sin(2x), ..., sin(Kx), cos(Kx)]
def fourier_features(x, K):
    cols = [np.ones_like(x)]
    for k in range(1, K+1):
        cols += [np.sin(k*x), np.cos(k*x)]
    return np.stack(cols, axis=1)

K = 15
Phi = fourier_features(x_data, K)  # (N, 2K+1)

# Fit with frequency-weighted ridge regression: penalising the mode at
# frequency k by lam*k^2 shrinks high frequencies first, mimicking the
# NTK spectral bias of an MLP. Decreasing lambda then plays the role of
# training time (large lambda ~ early training, small lambda ~ late).
# (A uniform penalty lam*I would shrink all frequencies equally here,
# since the Fourier design matrix is nearly orthogonal.)
col_freq = np.array([0] + [k for k in range(1, K+1) for _ in range(2)])
print('Spectral content recovered at different regularization levels:')
print('  (lambda large -> only low freq survives; lambda small -> all freq)')
for lam in [100, 10, 1, 0.01]:
    w = np.linalg.solve(Phi.T@Phi + lam*np.diag(col_freq**2), Phi.T@y_data)
    y_pred = Phi @ w
    # Extract Fourier coefficients of prediction
    b_1  = w[1]   # sin(x)  coefficient
    b_5  = w[9]   # sin(5x) coefficient
    b_11 = w[21]  # sin(11x) coefficient
    print(f'  lambda={lam:6.2f}: b_1={b_1:.3f}(true 1.0) b_5={b_5:.3f}(true 0.3) b_11={b_11:.3f}(true 0.1)')

9. Random Fourier Features (Rahimi & Recht, 2007)

Any shift-invariant kernel $k(\mathbf{x}-\mathbf{y})$ is the Fourier transform of a non-negative density $p(\boldsymbol{\omega})$ (Bochner's theorem). We can approximate $k(\mathbf{x}-\mathbf{y}) \approx \phi(\mathbf{x})^\top\phi(\mathbf{y})$ by sampling $D$ random frequencies $\boldsymbol{\omega}_j \sim p$.

Code cell 26

# === 9.1 Random Fourier Features for the RBF kernel ===

def rff_kernel_approx(X, D, sigma=1.0, seed=42):
    """Approximate RBF kernel k(x,y)=exp(-||x-y||^2/(2*sigma^2)) via D random features."""
    rng = np.random.default_rng(seed)
    d = X.shape[1]
    # Sample frequencies from N(0, sigma^{-2} I)
    W = rng.normal(0, 1.0/sigma, size=(d, D))
    b = rng.uniform(0, 2*np.pi, size=D)
    # Feature map: phi(x) = sqrt(2/D) * cos(W^T x + b)
    Z = np.sqrt(2.0/D) * np.cos(X @ W + b[None,:])
    return Z

np.random.seed(42)
n, d = 100, 2
X = np.random.randn(n, d)
sigma = 1.0

# Exact RBF kernel matrix
from scipy.spatial.distance import cdist
K_exact = np.exp(-cdist(X, X)**2 / (2*sigma**2))

print('RFF approximation quality vs D:')
print(f'  {"D":>6}  {"Max error":>12}  {"Mean error":>12}')
for D in [10, 50, 100, 500, 1000]:
    Z = rff_kernel_approx(X, D, sigma)
    K_approx = Z @ Z.T
    err = np.abs(K_exact - K_approx)
    print(f'  {D:>6}  {np.max(err):>12.5f}  {np.mean(err):>12.5f}')

Code cell 27

# === 9.2 Visualise kernel approximation quality ===
if HAS_MPL:
    Ds = [10, 50, 100, 500, 1000, 5000]
    max_errs = []
    for D in Ds:
        Z = rff_kernel_approx(X, D, sigma)
        K_approx = Z @ Z.T
        max_errs.append(np.max(np.abs(K_exact - K_approx)))

    fig, ax = plt.subplots(figsize=(9,5))
    ax.loglog(Ds, max_errs, 'o-', color=COLORS['primary'], lw=2, label='Max error')
    ax.loglog(Ds, [0.5/np.sqrt(D) for D in Ds], '--',
              color=COLORS['secondary'], lw=1.5, label='$O(1/\\sqrt{D})$')
    ax.set_title('Random Fourier Features: approximation error vs. D')
    ax.set_xlabel('Number of random features $D$')
    ax.set_ylabel('Max absolute error')
    ax.legend(); fig.tight_layout(); plt.show()
    print('Error decays as O(1/sqrt(D)) — consistent with concentration bounds.')

10. Smoothness and Spectral Decay

The key theorem: if $f \in C^k$ ($k$-times continuously differentiable and periodic), then $|c_n| = O(1/|n|^k)$. We verify this empirically.

Code cell 29

# === 10.1 Spectral decay rate vs smoothness ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts
def c_n(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

# Four functions with different smoothness
functions = {
    'Square (discont.)': (np.sign(np.sin(x)), 0),   # C^{-1}: 1/n decay
    'Triangle (C^0)':   (np.abs(x),           1),   # C^0: 1/n^2 decay
    'f=x^2 (C^0)':      (x**2,                2),   # periodised x^2 is C^0: 1/n^2 decay
    'Gaussian bump':    (np.exp(-x**2/(0.5)), -1),  # Analytic: exp decay
}

ns_test = np.arange(1, 30)
print('|c_n| for n=1..29:')
coeff_data = {}
for name, (fv, _) in functions.items():
    coeffs = [abs(c_n(fv, n)) for n in ns_test]
    coeff_data[name] = coeffs
    print(f'  {name}: |c_1|={coeffs[0]:.4f}, |c_5|={coeffs[4]:.4f}, |c_10|={coeffs[9]:.4f}')

Code cell 30

# === 10.2 Log-log plot of spectral decay ===
if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10,6))
    colors_list = [COLORS['error'], COLORS['secondary'],
                   COLORS['primary'], COLORS['tertiary']]
    for (name, coeffs), col in zip(coeff_data.items(), colors_list):
        ax.loglog(ns_test, coeffs, 'o-', color=col, lw=1.5,
                  markersize=4, label=name)
    # Reference lines
    ax.loglog(ns_test, 1.0/ns_test, '--', color=COLORS['neutral'],
              lw=1, label='$1/n$ (discontinuous)')
    ax.loglog(ns_test, 1.0/ns_test**2, ':',  color=COLORS['neutral'],
              lw=1, label='$1/n^2$ ($C^0$)')
    ax.set_title('Spectral decay rate vs. function smoothness')
    ax.set_xlabel('Frequency $n$')
    ax.set_ylabel('$|c_n|$')
    ax.legend(fontsize=9); fig.tight_layout(); plt.show()
    print('Smooth functions: fast decay. Discontinuous: slow 1/n decay.')

11. The Dirichlet Kernel

$$D_N(x) = \sum_{n=-N}^N e^{inx} = \frac{\sin((N+1/2)x)}{\sin(x/2)}$$

The partial sum is a convolution: $S_N f = f * D_N$. Unlike the Fejér kernel, $D_N$ is not non-negative, and this sign-changing behaviour causes the Gibbs phenomenon.
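The convolution identity can be checked numerically; a sketch (grid size, order $N$, and evaluation points are arbitrary) comparing the partial sum built from coefficients against a direct discretisation of $f * D_N$:

```python
import numpy as np

N = 10
xg = np.linspace(-np.pi, np.pi, 4000, endpoint=False)
dxg = 2*np.pi / len(xg)
fg = np.sign(np.sin(xg))                          # square wave samples
cn = {n: np.sum(fg*np.exp(-1j*n*xg))*dxg/(2*np.pi) for n in range(-N, N+1)}

def partial_sum(x0):
    """S_N f(x0) assembled from the Fourier coefficients."""
    return sum(c*np.exp(1j*n*x0) for n, c in cn.items()).real

def dirichlet_conv(x0):
    """(f * D_N)(x0) = (1/2pi) int f(t) D_N(x0 - t) dt, rectangle rule."""
    D = np.sin((N+0.5)*(x0-xg)) / np.sin((x0-xg)/2)
    return np.sum(fg*D)*dxg/(2*np.pi)

for x0 in [0.3, 1.1, -2.0]:
    a, b = partial_sum(x0), dirichlet_conv(x0)
    print(f'x0={x0:+.1f}: partial sum={a:.6f}  f*D_N={b:.6f}')
    assert abs(a - b) < 1e-6
```

Both sides use the same grid, so they agree to rounding: expanding $D_N$ as a sum of exponentials turns the discretised convolution into exactly the coefficient-based partial sum.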

Code cell 32

# === 11.1 Dirichlet kernel: closed form vs direct sum ===
N_pts = 2000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
x[x==0] = 1e-12  # avoid division by zero

def dirichlet_kernel(x, N):
    return np.sin((N+0.5)*x) / np.sin(x/2)

def fejer_kernel(x, N):
    s = np.sin((N+1)*x/2) / np.sin(x/2)
    return s**2 / (N+1)

for N in [5, 20]:
    D = dirichlet_kernel(x, N)
    integral = np.trapezoid(D, x) / (2*np.pi)
    min_val  = np.min(D)
    print(f'D_{N}: integral/(2pi)={integral:.5f} (should=1), min={min_val:.3f} (negative!)')

    F = fejer_kernel(x, N)
    fi = np.trapezoid(F, x) / (2*np.pi)
    fm = np.min(F)
    print(f'F_{N}: integral/(2pi)={fi:.5f} (should=1), min={fm:.6f} (non-negative!)')

Code cell 33

# === 11.2 Visualise Dirichlet vs Fejer kernels ===
if HAS_MPL:
    x_k = np.linspace(-np.pi, np.pi, 3000)
    x_k[x_k==0] = 1e-12
    fig, axes = plt.subplots(1,2,figsize=(14,5))
    for ax, N_k in zip(axes, [5,20]):
        D = dirichlet_kernel(x_k, N_k)
        F = fejer_kernel(x_k, N_k)
        ax.plot(x_k, D, color=COLORS['error'], lw=1.8, label=f'$D_{{{N_k}}}$ (Dirichlet)')
        ax.plot(x_k, F, color=COLORS['primary'], lw=1.8, label=f'$F_{{{N_k}}}$ (Fejer)')
        ax.axhline(0, color=COLORS['neutral'], lw=0.8, ls='--')
        ax.set_title(f'N = {N_k}')
        ax.set_xlabel('$x$'); ax.legend()
    fig.suptitle('Dirichlet kernel (negative) vs Fejer kernel (non-negative)', fontsize=14)
    fig.tight_layout(); plt.show()


12. Even/Odd Symmetry and Cosine/Sine Series

  • Even functions ($f(-x)=f(x)$): all $b_n=0$; the series is purely cosines (DCT connection)
  • Odd functions ($f(-x)=-f(x)$): all $a_n=0$; the series is purely sines

This halves the computation and reveals the symmetry structure of the signal.

Code cell 35

# === 12.1 Even/odd decomposition ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts
def c_n(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

# Any function = even part + odd part
f = np.exp(np.sin(x))   # generic function
f_even = (f + np.roll(f[::-1], 1)) / 2   # np.roll aligns f[::-1] with f(-x) on this grid
f_odd  = (f - np.roll(f[::-1], 1)) / 2

print('For even part: imaginary Fourier coefficients should be ~0')
for n in [1,2,3]:
    c = c_n(f_even, n)
    print(f'  c_{n}[even]: real={c.real:.5f}, imag={c.imag:.6f} (imag~0 = PASS)')

print('\nFor odd part: real Fourier coefficients should be ~0')
for n in [1,2,3]:
    c = c_n(f_odd, n)
    print(f'  c_{n}[odd]: real={c.real:.6f} (real~0 = PASS), imag={c.imag:.5f}')

13. Fourier Series on $[-L, L]$

For a $2L$-periodic function, replace $e^{inx}$ with $e^{in\pi x/L}$:

$$f(x) = \sum_{n=-\infty}^{\infty} c_n \, e^{in\pi x/L}, \qquad c_n = \frac{1}{2L}\int_{-L}^L f(x)\,e^{-in\pi x/L}\,dx$$

The fundamental frequency is $f_1 = 1/(2L)$. LLM connection: the transformer context length $T$ plays the role of the period, $2L = T$; different positional-encoding frequencies correspond to harmonics $n/T$.
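A quick numerical sketch of the general-period formula. The choices $f(x)=x$ and $L=2$ are illustrative; integrating by parts gives $c_n = iL(-1)^n/(n\pi)$ for this $f$.

```python
import numpy as np

# Coefficients on (-L, L) with basis exp(i*n*pi*x/L); test on f(x) = x.
L = 2.0
N_pts = 20000
x = np.linspace(-L, L, N_pts, endpoint=False)
dx = 2*L / N_pts

for n in [1, 2, 3, 7]:
    c_num = np.sum(x*np.exp(-1j*n*np.pi*x/L))*dx/(2*L)   # (1/2L) * integral
    c_ana = 1j*L*(-1.0)**n/(n*np.pi)
    print(f'n={n}: numeric={c_num:.6f}  analytic={c_ana:.6f}')
    assert np.isclose(c_num, c_ana, atol=1e-3)
```

Setting $L=\pi$ recovers the earlier sawtooth coefficients $c_n = i(-1)^n/n$.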

Code cell 37

# === 13.1 Sinusoidal positional encoding — transformer style ===

def sinusoidal_PE(seq_len, d_model, base=10000):
    """Compute sinusoidal positional encoding matrix (seq_len, d_model)."""
    PE = np.zeros((seq_len, d_model))
    pos = np.arange(seq_len)[:, None]              # (T,1)
    i   = np.arange(0, d_model, 2)[None, :]        # (1, d/2)
    theta = pos / base**(i / d_model)              # (T, d/2)
    PE[:, 0::2] = np.sin(theta)
    PE[:, 1::2] = np.cos(theta)
    return PE

T, d = 64, 32
PE = sinusoidal_PE(T, d)
print(f'PE shape: {PE.shape}')
print(f'PE[0]:  {PE[0,:6].round(3)} ...')
print(f'PE[1]:  {PE[1,:6].round(3)} ...')
print(f'PE[T-1]:{PE[-1,:6].round(3)} ...')

# Verify shift-invariance of inner products: PE[i] . PE[i+k] = sum_p cos(k*theta_p),
# which is independent of i, so raw sinusoidal PEs already encode relative offsets.
k = 5
print(f'\nPE[i] . PE[j] for j=i+{k} (should be constant for all i):')
dots = [PE[i] @ PE[i+k] for i in range(T - k)]
print(f'  min={min(dots):.4f}, max={max(dots):.4f}, std={np.std(dots):.4f}')
print('  (Constant for raw encodings; learned projections W_q, W_k break this, which RoPE repairs.)')

Code cell 38

# === 13.2 Visualise PE heatmap ===
if HAS_MPL:
    fig, ax = plt.subplots(figsize=(12,5))
    im = ax.imshow(PE.T, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
    fig.colorbar(im, ax=ax, label='Value')
    ax.set_title('Sinusoidal positional encodings (rows=dimensions, cols=positions)')
    ax.set_xlabel('Position $m$')
    ax.set_ylabel('Dimension $2i$ or $2i+1$')
    fig.tight_layout(); plt.show()
    print('Low-indexed dims: fast oscillation (high freq, local context)')
    print('High-indexed dims: slow oscillation (low freq, global context)')

14. Numerical Experiment: Parseval vs Truncation

How much energy does $S_N f$ capture? Parseval tells us exactly:

$$\frac{\lVert S_N f\rVert^2}{\lVert f\rVert^2} = \frac{\sum_{|n|\leq N}|c_n|^2}{\sum_n|c_n|^2}$$

For smooth functions, near-100% of the energy is captured with few terms.

Code cell 40

# === 14.1 Energy captured by N-term truncation ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts
def c_n(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

def energy_captured(fv, N_max=200, N_truncs=[1,3,5,10,20,50]):
    all_c = [c_n(fv, n) for n in range(-N_max, N_max+1)]
    total_energy = sum(abs(c)**2 for c in all_c)
    results = {}
    for N in N_truncs:
        captured = sum(abs(c_n(fv,n))**2 for n in range(-N, N+1))
        results[N] = 100 * captured / total_energy
    return results

f_sq  = np.sign(np.sin(x))
f_tri = np.abs(x)
f_gau = np.exp(-0.5*x**2)

print('% energy captured by N-term truncation:')
print(f"{'N':>4}  {'Square':>10}  {'Triangle':>10}  {'Gaussian':>10}")
for N in [1,3,5,10,20,50]:
    e_sq  = energy_captured(f_sq,  N_truncs=[N])[N]
    e_tri = energy_captured(f_tri, N_truncs=[N])[N]
    e_gau = energy_captured(f_gau, N_truncs=[N])[N]
    print(f"{N:>4}  {e_sq:>9.2f}%  {e_tri:>9.2f}%  {e_gau:>9.2f}%")

Code cell 41

# === 14.2 Amplitude spectrum plot ===
if HAS_MPL:
    n_range = np.arange(-30, 31)
    fig, axes = plt.subplots(1,3, figsize=(15,5))
    fig.suptitle('Amplitude spectrum $|c_n|$ for three waveforms', fontsize=14)
    specs = [
        ('Square wave', np.sign(np.sin(x))),
        ('Triangle $|x|$', np.abs(x)),
        ('Gaussian $e^{-x^2/2}$', np.exp(-0.5*x**2)),
    ]
    for ax, (name, fv) in zip(axes, specs):
        amps = [abs(c_n(fv, n)) for n in n_range]
        ax.stem(n_range, amps, linefmt='C0-',
                markerfmt='o', basefmt='k-')
        ax.set_title(name); ax.set_xlabel('$n$'); ax.set_ylabel('$|c_n|$')
    fig.tight_layout(); plt.show()


15. Complete Example: Heat Equation Solution

The solution of $u_t = \alpha^2 u_{xx}$ on $[0,\pi]$ with $u(0,t)=u(\pi,t)=0$ and $u(x,0) = x(\pi-x)$ uses the Fourier sine series derived in notes.md §F.1:

$$u(x,t) = \frac{8}{\pi}\sum_{k=0}^\infty \frac{1}{(2k+1)^3} \sin((2k+1)x)\,e^{-\alpha^2(2k+1)^2 t}$$

Code cell 43

# === 15.1 Heat equation solution via Fourier series ===

def heat_solution(x_arr, t, alpha=1.0, N_terms=50):
    u = np.zeros_like(x_arr, dtype=float)
    for k in range(N_terms):
        n = 2*k + 1
        u += (1/n**3) * np.sin(n*x_arr) * np.exp(-(alpha*n)**2 * t)
    return u * 8/np.pi

x_arr = np.linspace(0, np.pi, 500)
f0 = x_arr*(np.pi - x_arr)   # initial condition

print('Heat equation solution at various times:')
print(f'  t=0: max(u) = {np.max(heat_solution(x_arr, 0)):.5f}  (true max = pi^2/4 = {np.pi**2/4:.5f})')
for t in [0.01, 0.1, 0.5, 2.0]:
    u = heat_solution(x_arr, t)
    print(f'  t={t}: max(u) = {np.max(u):.6f}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10,6))
    ax.plot(x_arr, f0, color=COLORS['neutral'], lw=2, label='$t=0$ (initial)')
    for t, col in [(0.05, COLORS['secondary']), (0.2, COLORS['primary']),
                   (0.5, COLORS['tertiary']), (1.0, COLORS['error'])]:
        ax.plot(x_arr, heat_solution(x_arr, t), color=col, lw=1.8, label=f'$t={t}$')
    ax.set_title('Heat equation: high frequencies decay fastest')
    ax.set_xlabel('$x$'); ax.set_ylabel('$u(x,t)$'); ax.legend()
    fig.tight_layout(); plt.show()
    print('High-frequency modes (large n) decay as exp(-(alpha*n)^2 * t) — very fast.')

16. Fourier Series and the DFT Connection

The Discrete Fourier Transform (full treatment in §20-03) computes Fourier series coefficients for finite sequences. Here we preview the connection:

$$X[k] = \sum_{n=0}^{N-1} x[n]\,e^{-2\pi ikn/N} = N\cdot c_k$$

where $c_k$ is the $k$-th Fourier coefficient of the sampled function, with samples taken over one period starting at $x=0$. The FFT computes all $N$ coefficients in $O(N\log N)$.

Code cell 45

# === 16.1 DFT vs direct Fourier coefficient computation ===
N_pts = 64
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts
def c_n_rect(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

f_test = np.cos(3*x) + 0.5*np.sin(5*x)

# Direct coefficients
ns = np.arange(-8,9)
direct = np.array([c_n_rect(f_test, n) for n in ns])

# Via FFT
# The sample grid starts at x = -pi, so each DFT bin picks up a phase
# e^{ik*pi}: c_k = (-1)^k * F[k]/N  (and F[N+k] holds frequency k < 0).
F = np.fft.fft(f_test)
fft_coeffs = np.array([
    (-1.0)**n * (F[n] if n >= 0 else F[N_pts+n]) / N_pts for n in ns
])

print('Comparison: direct Fourier coefficients vs FFT/N:')
print(f"{'n':>4}  {'Direct':>16}  {'FFT/N':>16}  Match")
for n, d, f_c in zip(ns, direct, fft_coeffs):
    ok = np.isclose(d, f_c, atol=1e-8)
    if abs(d) > 1e-6:
        print(f"{n:>4}  {d.real:>8.4f}{d.imag:>+8.4f}j  {f_c.real:>8.4f}{f_c.imag:>+8.4f}j  {'PASS' if ok else 'FAIL'}")

17. Amplitude and Phase Spectrum

The Fourier series $f(x)=\sum_n c_n e^{inx}$ of a real signal can be written as:

$$f(x) = c_0 + \sum_{n=1}^\infty 2|c_n|\cos(nx + \arg c_n)$$

  • Amplitude spectrum $|c_n|$: how much of each frequency is present
  • Phase spectrum $\arg(c_n)$: timing offset of each frequency component

Changing phases while preserving amplitudes destroys signal structure; this is why phase alignment matters in audio and image processing.
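As a minimal check of the amplitude-phase form (the test signal is a hand-picked example), reconstruct a real signal from $c_0$ and the terms $2|c_n|\cos(nx+\arg c_n)$:

```python
import numpy as np

N_pts = 4096
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi / N_pts
f = 0.5 + np.cos(2*x - 0.7) + 0.3*np.sin(5*x)     # real, band-limited test signal

c = {n: np.sum(f*np.exp(-1j*n*x))*dx/(2*np.pi) for n in range(0, 8)}
recon = np.full_like(x, c[0].real)                 # c_0 is real for real f
for n in range(1, 8):
    recon += 2*abs(c[n])*np.cos(n*x + np.angle(c[n]))

print('max |f - reconstruction| =', np.max(np.abs(f - recon)))
assert np.max(np.abs(f - recon)) < 1e-10
```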

Code cell 47

# === 17.1 Phase scrambling destroys signal structure ===
np.random.seed(42)
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts
def c_n(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

f = np.sign(np.sin(x))   # square wave
N_coeff = 50

# Reconstruct from coefficients
def reconstruct(coeffs_dict):
    return sum(c * np.exp(1j*n*x) for n, c in coeffs_dict.items()).real

# Original coefficients
orig = {n: c_n(f, n) for n in range(-N_coeff, N_coeff+1)}

# Phase-scrambled: preserve |c_n| but randomise phases. Conjugate symmetry
# (c_{-n} = conj(c_n)) is enforced so the scrambled signal stays real and
# keeps exactly the same power.
scrambled = dict(orig)
for n in range(1, N_coeff+1):
    scrambled[n]  = abs(orig[n]) * np.exp(1j*np.random.uniform(0, 2*np.pi))
    scrambled[-n] = np.conj(scrambled[n])

f_orig = reconstruct(orig)
f_scram = reconstruct(scrambled)

# Compare: same power spectrum, different waveform
print(f'Original signal std:   {np.std(f_orig):.4f}')
print(f'Scrambled signal std:  {np.std(f_scram):.4f}')
print(f'Same power? {np.isclose(np.mean(f_orig**2), np.mean(f_scram**2), rtol=0.05)}')
print(f'Same signal? {np.isclose(f_orig, f_scram, atol=0.1).mean():.1%} of points match')
print('PASS - same amplitude spectrum, completely different waveform')

Code cell 48

# === 17.2 Visualise amplitude vs phase scrambling ===
if HAS_MPL:
    x_plot = x[:2000]; f_true = np.sign(np.sin(x_plot))
    f_o = f_orig[:2000]; f_s = f_scram[:2000]
    fig, axes = plt.subplots(1,3, figsize=(15,4))
    for ax, fv, name in zip(axes,
            [f_true, f_o, f_s],
            ['True square wave','Reconstructed (N=50)','Phase-scrambled']):
        ax.plot(x_plot, fv, color=COLORS['primary'], lw=1.2)
        ax.set_title(name); ax.set_xlabel('$x$'); ax.set_ylim(-2,2)
    fig.suptitle('Phase matters: same amplitudes, scrambled phases = random signal', fontsize=13)
    fig.tight_layout(); plt.show()

print("audit: 17.2 visualisation complete (or skipped if matplotlib unavailable).")

18. Basel Problem Generalisation via Parseval

Parseval's theorem turns Fourier series computations into evaluations of classical number-theory series. We compute several in one go.
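For the $\pi^2/6$ case, the Parseval bookkeeping is: with $f(x)=x$ on $(-\pi,\pi)$, $|c_n| = 1/n$ for $n\neq 0$, so $\frac{1}{2\pi}\int_{-\pi}^{\pi}x^2\,dx = \frac{\pi^2}{3} = \sum_{n\neq 0}|c_n|^2 = 2\sum_{n\ge 1}\frac{1}{n^2}$. A numerical sketch of both sides (self-contained, own grid):

```python
import numpy as np

# Parseval for f(x) = x: LHS = (1/2pi) int x^2 dx = pi^2/3,
# RHS = sum_{n != 0} |c_n|^2 with |c_n| = 1/n.
N = 20000
x = np.linspace(-np.pi, np.pi, N, endpoint=False)
f = x
dx = 2*np.pi/N

# |c_n| should come out as 1/n
for n in [1, 2, 3, 10]:
    cn = np.sum(f*np.exp(-1j*n*x))*dx/(2*np.pi)
    print(f'|c_{n}| = {abs(cn):.6f}   (1/n = {1/n:.6f})')

print(f'(1/2pi) int x^2 dx ~ {np.mean(f**2):.6f}   (pi^2/3 = {np.pi**2/3:.6f})')
```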

Code cell 50

# === 18.1 Series evaluations via Parseval ===
# sum 1/n^2 = pi^2/6    (from f=x)
# sum 1/n^4 = pi^4/90   (from f=x^2)
# sum 1/(2k+1)^2 = pi^2/8 (from square wave)

N_sum = 100000
ns = np.arange(1, N_sum+1)

s2  = np.sum(1/ns**2)
s4  = np.sum(1/ns**4)
s_odd = np.sum(1/(2*ns-1)**2)

print('Series evaluations via Parseval:')
print(f'  sum 1/n^2      = {s2:.8f}  (pi^2/6  = {np.pi**2/6:.8f})')
print(f'  sum 1/n^4      = {s4:.8f}  (pi^4/90 = {np.pi**4/90:.8f})')
print(f'  sum 1/(2k+1)^2 = {s_odd:.8f}  (pi^2/8  = {np.pi**2/8:.8f})')

for name, val, exact in [('sum 1/n^2', s2, np.pi**2/6),
                         ('sum 1/n^4', s4, np.pi**4/90),
                         ('sum 1/(2k+1)^2', s_odd, np.pi**2/8)]:
    status = 'PASS' if np.isclose(val, exact, rtol=1e-3) else f'FAIL: {val} vs {exact}'
    print(f'  {name}: {status}')

19. Fourier Series and Convolution

The convolution of two periodic functions $f, g$:

$h(x) = (f * g)(x) = \frac{1}{2\pi}\int_{-\pi}^{\pi} f(t)\,g(x-t)\,dt$

has Fourier coefficients $c_n[h] = c_n[f]\cdot c_n[g]$. Convolution in time ↔ multiplication in frequency. (Full treatment: §20-04 Convolution Theorem.)
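Why the coefficients multiply: substitute the definition of $h$ into the coefficient integral, write $e^{-inx} = e^{-int}\,e^{-in(x-t)}$, swap the order of integration, and substitute $u = x - t$ (the inner integral runs over a full period, so periodicity keeps its value):

$c_n[h] = \frac{1}{2\pi}\int_{-\pi}^{\pi}\left(\frac{1}{2\pi}\int_{-\pi}^{\pi} f(t)\,g(x-t)\,dt\right) e^{-inx}\,dx = \left(\frac{1}{2\pi}\int_{-\pi}^{\pi} f(t)\,e^{-int}\,dt\right)\left(\frac{1}{2\pi}\int_{-\pi}^{\pi} g(u)\,e^{-inu}\,du\right) = c_n[f]\,c_n[g]$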

Code cell 52

# === 19.1 Convolution theorem for Fourier series ===
N_pts = 10000
x = np.linspace(-np.pi, np.pi, N_pts, endpoint=False)
dx = 2*np.pi/N_pts
def c_n(fv, n): return np.sum(fv*np.exp(-1j*n*x))*dx/(2*np.pi)

f = np.cos(2*x) + 0.5*np.sin(3*x)
g = np.sin(x)   + 0.3*np.cos(2*x)

# Circular convolution via FFT (O(N log N)).  Dividing by N_pts supplies the
# dx/(2*pi) = 1/N_pts normalisation in the definition of f*g; np.roll
# compensates the half-period offset of a grid that starts at -pi, not 0.
h_direct = np.roll(np.fft.ifft(np.fft.fft(f) * np.fft.fft(g)).real, -N_pts//2) / N_pts

# Verify: c_n[h] = c_n[f] * c_n[g]
print('Convolution theorem: c_n[f*g] = c_n[f] * c_n[g]')
for n in [1, 2, 3, 4, 5]:
    lhs = c_n(h_direct, n)
    rhs = c_n(f, n) * c_n(g, n)
    ok = np.isclose(lhs, rhs, atol=1e-4)
    print(f'  n={n}: {"PASS" if ok else "FAIL"}  c_n[h]={lhs:.5f}  c_n[f]*c_n[g]={rhs:.5f}')
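A quick application of the theorem (a sketch with a hypothetical kernel half-width `w`, not part of the original cells): convolving a square wave with a narrow box kernel multiplies each $c_n$ by $\sin(nw)/(nw)$, attenuating high harmonics — which is exactly how a smoothing filter works.

```python
import numpy as np

# Low-pass filtering = coefficient-wise multiplication.
# Box kernel of half-width w with area 2*pi, so c_0[g] = 1 and
# c_n[g] = sin(n*w)/(n*w).
N = 4096
x = np.linspace(0, 2*np.pi, N, endpoint=False)   # grid from 0: no phase offset
f = np.sign(np.sin(x))                           # square wave
w = 0.3
g = np.where(np.minimum(x, 2*np.pi - x) < w, np.pi/w, 0.0)

# (f * g) with the 1/(2*pi) normalisation: circular FFT convolution over N
h = np.fft.ifft(np.fft.fft(f) * np.fft.fft(g)).real / N

cf = np.fft.fft(f) / N
ch = np.fft.fft(h) / N
for n in [1, 3, 5, 7]:                           # square wave has odd harmonics
    print(f'n={n}: |c_n[h]/c_n[f]| = {abs(ch[n]/cf[n]):.4f}   '
          f'sin(nw)/(nw) = {np.sin(n*w)/(n*w):.4f}')
```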

Summary: Complete Reference

| Concept | Formula | AI Application |
| --- | --- | --- |
| Fourier coefficient | $c_n = \frac{1}{2\pi}\int f\,e^{-inx}\,dx$ | PE frequencies |
| Orthonormality | $\langle e^{inx},e^{imx}\rangle = \delta_{nm}$ | Basis of $L^2$ |
| Parseval | $\sum_n \lvert c_n\rvert^2 = \frac{1}{2\pi}\int \lvert f\rvert^2\,dx$ | Series evaluations (§18) |
| Shift | $c_n[f(\cdot-x_0)] = e^{-inx_0}c_n[f]$ | RoPE relative position |
| Differentiation | $c_n[f'] = in\cdot c_n[f]$ | Spectral ODEs |
| Gibbs | ≈9% overshoot, persists for all $N$ | Anti-aliasing, windowing |
| Spectral decay | $\lvert c_n\rvert = O(n^{-k})$ for $f \in C^k$ | Spectral bias (§8) |
| Convolution | $c_n[f*g] = c_n[f]\cdot c_n[g]$ | CNNs, SSMs |
| RoPE | $q_m = R_m q$, score$(m,n)=f(m-n)$ | LLaMA-3, Mistral |
| RFF | $k(x,y)\approx\phi(x)^\top\phi(y)$ | Scalable kernels |

Next: §20-02 Fourier Transform
