Converted from theory.ipynb for web reading.

Fourier Transform — Theory Notebook

"The Fourier transform is a mathematical prism: it refracts a signal into its constituent frequencies, revealing structure invisible in time."

This notebook provides interactive derivations, visualizations, and numerical experiments covering all major topics from the §20-02 Fourier Transform notes. Run cells top-to-bottom.

Sections covered:

  1. Setup and imports
  2. From Fourier Series to Fourier Transform (T→∞ limit)
  3. Computing standard Fourier Transforms
  4. Core properties (shift, scaling, differentiation)
  5. Fourier Inversion Theorem
  6. Plancherel's Theorem and energy conservation
  7. Heisenberg Uncertainty Principle
  8. Tempered distributions: Dirac delta and Dirac comb
  9. Applications: FNet, Random Fourier Features, Spectral Normalization, RoPE

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import fft as sp_fft
from scipy import signal as sp_signal
from scipy.special import erf

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({
        'figure.figsize': (10, 6), 'figure.dpi': 120,
        'font.size': 13, 'axes.titlesize': 15,
        'axes.labelsize': 13, 'lines.linewidth': 2.0,
        'axes.spines.top': False, 'axes.spines.right': False,
    })
    COLORS = {
        'primary': '#0077BB', 'secondary': '#EE7733',
        'tertiary': '#009988', 'error': '#CC3311',
        'neutral': '#555555', 'highlight': '#EE3377',
    }
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. HAS_MPL =', HAS_MPL)

1. From Fourier Series to Fourier Transform

1.1 The T→∞ Limit Argument

We visualize how a Fourier series with period $2L$ becomes the Fourier Transform as $L \to \infty$. The discrete spectrum (spikes at harmonics $n/(2L)$) becomes a continuous function.

Code cell 5

# === 1.1 From Fourier Series to Fourier Transform ===
# Visualize: Fourier series spectrum of rect pulse as period L grows

def fourier_coeffs_rect(L, N_harmonics=50):
    """Fourier coefficients c_n for rect[-0.5, 0.5] in [-L, L] period."""
    ns = np.arange(-N_harmonics, N_harmonics+1)
    freqs = ns / (2*L)  # frequencies xi_n = n/(2L)
    # c_n = (1/2L) * integral_{-0.5}^{0.5} e^{-2pi*i*n*t/2L} dt
    with np.errstate(divide='ignore', invalid='ignore'):
        coeffs = np.where(
            ns == 0,
            1.0 / (2*L),  # c_0 = 1/(2L)
            np.sin(np.pi * ns / (2*L)) / (np.pi * ns)  # sinc(n/(2L))
        )
    return freqs, np.abs(coeffs) * 2 * L  # scaled to show spectrum density

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle(r'Fourier Series $\to$ Fourier Transform as $L \to \infty$',
                 fontsize=15)

    # Continuous FT of rect: sinc function
    xi_cont = np.linspace(-4, 4, 1000)
    ft_cont = np.sinc(xi_cont)  # = sin(pi*xi)/(pi*xi)

    for ax, L in zip(axes, [1, 3, 10]):
        freqs, amp = fourier_coeffs_rect(L, N_harmonics=60)
        ax.plot(xi_cont, ft_cont, color=COLORS['neutral'],
                linewidth=1.5, linestyle='--', label=r'$\widehat{\rm rect}(\xi)=\rm sinc(\xi)$',
                zorder=5)
        ax.vlines(freqs, 0, amp, color=COLORS['primary'], linewidth=1.0,
                  label=f'Fourier coeffs, L={L}')
        ax.set_xlim(-4, 4)
        ax.set_ylim(-0.3, 1.1)
        ax.set_xlabel(r'Frequency $\xi$')
        ax.set_ylabel('Amplitude')
        ax.set_title(f'Period $2L={2*L}$')
        ax.legend(fontsize=9)

    fig.tight_layout()
    plt.show()
    print('As L grows, discrete spectrum -> continuous sinc: FT emerges.')
else:
    print('matplotlib not available; skipping plot.')
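The limit argument can also be read in the other direction: inside one period, the same coefficients synthesize the pulse itself. A supplementary sketch (not an original cell; the harmonic count `N` and the 0.1 exclusion zone around the jumps are arbitrary choices) reconstructs the rect pulse from its partial Fourier sum:

```python
import numpy as np

L, N = 3.0, 400  # period half-width and number of harmonics (arbitrary)
ns = np.arange(-N, N + 1)
with np.errstate(divide='ignore', invalid='ignore'):
    # Same coefficients as fourier_coeffs_rect: c_n = sin(pi*n/(2L)) / (pi*n)
    c = np.where(ns == 0, 0.5 / L, np.sin(np.pi * ns / (2 * L)) / (np.pi * ns))

t = np.linspace(-L, L, 1201)
# Partial-sum synthesis: f(t) ~ sum_n c_n e^{2*pi*i*n*t/(2L)}
synth = np.real(np.exp(2j * np.pi * np.outer(t, ns) / (2 * L)) @ c)
target = (np.abs(t) <= 0.5).astype(float)

interior = np.abs(np.abs(t) - 0.5) > 0.1  # stay away from the jumps (Gibbs)
print(f'max reconstruction error away from jumps: {np.max(np.abs(synth - target)[interior]):.4f}')
```

Away from the discontinuities the partial sum converges pointwise; near them the Gibbs overshoot persists, as section 4 shows in more detail.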

2. Computing Standard Fourier Transforms

2.1 The Four Standard Examples

We compute and visualize the Fourier Transforms of:

  1. Rectangular pulse → sinc function
  2. Gaussian → Gaussian (self-dual!)
  3. Double-sided exponential → Lorentzian
  4. One-sided exponential → complex Lorentzian

Code cell 7

# === 2.1 Standard Fourier Transform Examples ===
# Numerical FT via scipy.fft

def numerical_ft(f_vals, t_vals):
    """Approximate FT via trapezoidal rule on a uniform grid."""
    dt = t_vals[1] - t_vals[0]
    N = len(t_vals)
    # Use FFT for efficiency
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(N, d=dt))
    return xi, F

# Common time grid
t = np.linspace(-10, 10, 4096)
dt = t[1] - t[0]

# 1. Rectangular pulse rect(t)
rect = ((np.abs(t) <= 0.5)).astype(float)
xi_rect, F_rect = numerical_ft(rect, t)
sinc_theory = np.sinc(xi_rect)  # normalized sinc = sin(pi*xi)/(pi*xi)

# 2. Gaussian e^{-pi*t^2} (self-dual)
gauss = np.exp(-np.pi * t**2)
xi_gauss, F_gauss = numerical_ft(gauss, t)
gauss_theory = np.exp(-np.pi * xi_gauss**2)

# 3. Double-sided exponential e^{-|t|}
a = 1.0
dsexp = np.exp(-a * np.abs(t))
xi_ds, F_ds = numerical_ft(dsexp, t)
lorentz_theory = 2*a / (a**2 + (2*np.pi*xi_ds)**2)

# 4. One-sided exponential e^{-t} for t>=0
onesided = np.where(t >= 0, np.exp(-a*t), 0.0)
xi_os, F_os = numerical_ft(onesided, t)
os_theory_mag = 1.0 / np.sqrt(a**2 + (2*np.pi*xi_os)**2)

print('Transforms computed numerically.')
print(f'Max error (rect->sinc):   {np.max(np.abs(np.real(F_rect[np.abs(xi_rect)<4]) - sinc_theory[np.abs(xi_rect)<4])):.4f}')
print(f'Max error (Gauss->Gauss): {np.max(np.abs(np.real(F_gauss[np.abs(xi_gauss)<4]) - gauss_theory[np.abs(xi_gauss)<4])):.4f}')
print(f'Max error (dsexp->Lorentz): {np.max(np.abs(np.real(F_ds[np.abs(xi_ds)<4]) - lorentz_theory[np.abs(xi_ds)<4])):.4f}')

Code cell 8

# === 2.1 Plot standard FT examples ===
if HAS_MPL:
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    fig.suptitle('Standard Fourier Transform Pairs', fontsize=15)

    examples = [
        (t, rect, xi_rect, F_rect, sinc_theory, r'$f(t)=\mathrm{rect}(t)$',
         r'$\hat{f}(\xi)=\mathrm{sinc}(\xi)$', (-2,2), (-2,2)),
        (t, gauss, xi_gauss, F_gauss, gauss_theory, r'$f(t)=e^{-\pi t^2}$',
         r'$\hat{f}(\xi)=e^{-\pi\xi^2}$ (self-dual)', (-3,3), (-3,3)),
        (t, dsexp, xi_ds, F_ds, lorentz_theory, r'$f(t)=e^{-|t|}$',
         r'$\hat{f}(\xi)=\frac{2}{1+(2\pi\xi)^2}$', (-5,5), (-4,4)),
        (t, onesided, xi_os, F_os, os_theory_mag, r'$f(t)=e^{-t}\mathbf{1}_{t\geq0}$',
         r'$|\hat{f}(\xi)|=1/\sqrt{1+(2\pi\xi)^2}$', (-2,5), (-4,4)),
    ]

    for col, (tv, fv, xiv, Fv, Fth, tlabel, flabel, tlim, flim) in enumerate(examples):
        ax_t = axes[0, col]
        ax_f = axes[1, col]

        mask_t = (tv >= tlim[0]) & (tv <= tlim[1])
        ax_t.plot(tv[mask_t], fv[mask_t], color=COLORS['primary'])
        ax_t.set_title(tlabel)
        ax_t.set_xlabel('Time $t$')
        ax_t.set_ylabel('$f(t)$')

        mask_f = (xiv >= flim[0]) & (xiv <= flim[1])
        if col == 3:  # one-sided: plot magnitude
            ax_f.plot(xiv[mask_f], np.abs(Fv[mask_f]),
                      color=COLORS['primary'], label='Numerical')
            ax_f.plot(xiv[mask_f], Fth[mask_f],
                      color=COLORS['secondary'], linestyle='--', label='Theory')
        else:
            ax_f.plot(xiv[mask_f], np.real(Fv[mask_f]),
                      color=COLORS['primary'], label='Numerical')
            ax_f.plot(xiv[mask_f], Fth[mask_f],
                      color=COLORS['secondary'], linestyle='--', label='Theory')
        ax_f.set_title(flabel)
        ax_f.set_xlabel(r'Frequency $\xi$')
        ax_f.set_ylabel(r'$\hat{f}(\xi)$')
        ax_f.legend(fontsize=9)

    fig.tight_layout()
    plt.show()
    print('All four standard FT pairs match theory.')

2.2 Convention Comparison: $\xi$ vs $\omega$

The three FT conventions differ only in normalization. The $\xi$-convention (used here) is symmetric and places no $2\pi$ in the inverse. Below we verify the conversion formula $F_\omega(\omega) = F_\xi(\omega/2\pi)$.

Code cell 10

# === 2.2 Convention comparison ===
# For f(t) = e^{-pi*t^2}, compare xi and omega conventions

t_fine = np.linspace(-8, 8, 8192)
f_gauss = np.exp(-np.pi * t_fine**2)

# xi-convention: F_xi(xi) = int f(t) e^{-2pi*i*xi*t} dt = e^{-pi*xi^2}
xi_vals, F_xi = numerical_ft(f_gauss, t_fine)

# omega-convention (no 1/2pi): F_omega(omega) = int f(t) e^{-i*omega*t} dt
# Relationship: F_omega(omega) = F_xi(omega/(2*pi))
# For this Gaussian: F_omega(omega) = e^{-omega^2/(4*pi)}

omega_vals = 2 * np.pi * xi_vals
F_omega_theory = np.exp(-omega_vals**2 / (4*np.pi))

mask = np.abs(xi_vals) < 3
F_xi_theory = np.exp(-np.pi * xi_vals**2)

err_xi = np.max(np.abs(np.real(F_xi[mask]) - F_xi_theory[mask]))
err_conv = np.max(np.abs(F_omega_theory[mask] - F_xi_theory[mask]))
print(f'xi-convention: max error = {err_xi:.2e}')
print(f'conversion check F_omega(2*pi*xi) = F_xi(xi): max diff = {err_conv:.2e}')

# Parseval check in xi-convention: integral |f|^2 dt = integral |F_xi|^2 dxi
dt = t_fine[1] - t_fine[0]
dxi = xi_vals[1] - xi_vals[0]
energy_time = np.sum(f_gauss**2) * dt
energy_freq = np.sum(np.abs(F_xi)**2) * dxi
print(f'Energy in time:  {energy_time:.6f}')
print(f'Energy in freq:  {energy_freq:.6f}')
print(f'Parseval error:  {abs(energy_time - energy_freq):.2e}')

ok = abs(energy_time - energy_freq) < 1e-3
print(f"{'PASS' if ok else 'FAIL'} - Parseval identity verified numerically")
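For reference, all three common conventions can be compared in closed form for this Gaussian. A supplementary sketch (not an original cell; `F_sym` is a local name for the unitary $1/\sqrt{2\pi}$ convention):

```python
import numpy as np

# Three conventions for the same Gaussian f(t) = e^{-pi t^2}:
#   xi-convention:           F_xi(xi)  = e^{-pi xi^2}
#   omega, no prefactor:     F_om(om)  = F_xi(om/(2 pi)) = e^{-om^2/(4 pi)}
#   unitary (1/sqrt(2 pi)):  F_sym(om) = F_om(om) / sqrt(2 pi)
xi = np.linspace(-3, 3, 601)
om = 2 * np.pi * xi

F_xi = np.exp(-np.pi * xi**2)
F_om = np.exp(-om**2 / (4 * np.pi))
F_sym = F_om / np.sqrt(2 * np.pi)

# The conversion formula ties the conventions together
assert np.allclose(F_om, F_xi)  # F_omega(2*pi*xi) = F_xi(xi)

# Parseval holds with measure dxi in the xi-convention and d(omega) in the
# unitary convention; both equal ||f||_2^2
dxi, dom = xi[1] - xi[0], om[1] - om[0]
print(np.sum(F_xi**2) * dxi, np.sum(F_sym**2) * dom)
```

Note that the plain $\omega$-convention is not unitary: it conserves energy only up to a factor $2\pi$, which is why the cell above checks Parseval in the $\xi$-convention.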

3. Core Properties of the Fourier Transform

3.1 Time-Shift and Modulation

The time-shift property: $\mathcal{F}[f(t-a)](\xi) = e^{-2\pi i\xi a}\hat{f}(\xi)$. Shifting in time leaves the magnitude spectrum unchanged but adds a linear phase.

Code cell 12

# === 3.1 Time-Shift Property ===
import numpy as np
from scipy import fft as sp_fft
try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120,
                          'font.size': 13, 'axes.titlesize': 14,
                          'axes.spines.top': False, 'axes.spines.right': False})
    COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
              'tertiary': '#009988', 'error': '#CC3311',
              'neutral': '#555555', 'highlight': '#EE3377'}
    HAS_MPL = True
except ImportError:
    HAS_MPL = False
np.random.seed(42)

def numerical_ft(f_vals, t_vals):
    dt = t_vals[1] - t_vals[0]
    N = len(t_vals)
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(N, d=dt))
    return xi, F

t = np.linspace(-10, 10, 4096)
a = 2.0  # time shift

# Gaussian pulse and its shifted version
f = np.exp(-np.pi * t**2)
f_shifted = np.exp(-np.pi * (t - a)**2)

xi, F = numerical_ft(f, t)
xi_s, F_s = numerical_ft(f_shifted, t)

# Theory: F_shifted = e^{-2pi*i*xi*a} * F
F_s_theory = np.exp(-2j * np.pi * xi * a) * F

mask = np.abs(xi) < 3
# Compare phases only where |F| is non-negligible, and compare mod 2*pi via
# the ratio (np.angle differences can spuriously jump by 2*pi at +/-pi)
sig = mask & (np.abs(F) > 1e-6 * np.abs(F).max())
mag_err = np.max(np.abs(np.abs(F_s[mask]) - np.abs(F_s_theory[mask])))
phase_err = np.max(np.abs(np.angle(F_s[sig] * np.conj(F_s_theory[sig]))))

print(f'Time shift a = {a}')
print(f'Magnitude change: max|mag_shift - mag_original| = {np.max(np.abs(np.abs(F_s[mask]) - np.abs(F[mask]))):.2e}')
print(f'  -> CONFIRMED: time shift does NOT change magnitude spectrum')
print(f'Phase theory error: {phase_err:.2e}')
ok = phase_err < 0.01
print(f"{'PASS' if ok else 'FAIL'} - shift property: F[f(t-a)](xi) = e^(-2pi*i*xi*a)*F(xi)")

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle('Time-Shift Property: Magnitude Unchanged, Phase Linear')

    ax = axes[0]
    ax.plot(xi[mask], np.abs(F[mask]), color=COLORS['primary'], label='$|\\hat{f}(\\xi)|$ (original)')
    ax.plot(xi[mask], np.abs(F_s[mask]), color=COLORS['secondary'],
            linestyle='--', label=f'$|\\hat{{f_{{a}}}}(\\xi)|$, a={a}')
    ax.set_xlabel(r'Frequency $\xi$')
    ax.set_ylabel('Magnitude')
    ax.set_title('Magnitude spectrum (identical)')
    ax.legend()

    ax = axes[1]
    ax.plot(xi[mask], np.unwrap(np.angle(F_s[mask])), color=COLORS['secondary'],
            label='Numerical phase (shifted)')
    ax.plot(xi[mask], -2*np.pi*xi[mask]*a, color=COLORS['error'],
            linestyle=':', linewidth=2.5, label=f'Theory: $-2\\pi\\xi a$ = $-2\\pi\\xi \\cdot {a}$')
    ax.set_xlabel(r'Frequency $\xi$')
    ax.set_ylabel('Phase (radians)')
    ax.set_title('Phase spectrum (linear in xi)')
    ax.legend()

    fig.tight_layout()
    plt.show()
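Cell 12 covers the shift half of this section's title; the dual modulation property, $\mathcal{F}[e^{2\pi i\xi_0 t}f(t)](\xi) = \hat f(\xi - \xi_0)$, can be checked the same way. A supplementary sketch (not an original cell; the grid is chosen so that $t=0$ falls exactly on a sample, which keeps the complex comparison clean, and $\xi_0 = 1.5$ is arbitrary):

```python
import numpy as np
from scipy import fft as sp_fft

N, dt = 4096, 0.005
t = (np.arange(N) - N // 2) * dt  # t = 0 is exactly a sample point
xi0 = 1.5                          # modulation frequency (arbitrary)

f = np.exp(-np.pi * t**2)
f_mod = np.exp(2j * np.pi * xi0 * t) * f  # multiply by a pure tone

F_mod = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_mod))) * dt
xi = sp_fft.fftshift(sp_fft.fftfreq(N, d=dt))

# Theory: the spectrum of f, shifted to be centered at xi0
F_theory = np.exp(-np.pi * (xi - xi0)**2)

mask = np.abs(xi) < 5
err = np.max(np.abs(F_mod[mask] - F_theory[mask]))
print(f'modulation property error: {err:.2e}')
```

Shift and modulation are Fourier duals of each other: translation in one domain is a linear phase in the other.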

3.2 Scaling / Dilation Property

The scaling theorem: $\mathcal{F}[f(at)](\xi) = \frac{1}{|a|}\hat{f}(\xi/a)$. Time compression ($|a|>1$) stretches the spectrum; time dilation compresses it. This is the time-bandwidth product in action.

Code cell 14

# === 3.2 Scaling Property ===
# Show time compression <-> spectrum expansion

t = np.linspace(-10, 10, 8192)
f_base = np.exp(-np.pi * t**2)  # Gaussian, width 1

a_vals = [0.5, 1.0, 2.0, 4.0]  # compression factors
results = {}
for a in a_vals:
    f_scaled = np.exp(-np.pi * (a*t)**2)  # f(at)
    xi, F = numerical_ft(f_scaled, t)
    # Theory: (1/|a|) * F_base(xi/a) = (1/|a|) * e^{-pi*(xi/a)^2}
    F_theory = (1/abs(a)) * np.exp(-np.pi * (xi/a)**2)
    mask = np.abs(xi) < 6
    err = np.max(np.abs(np.real(F[mask]) - F_theory[mask]))
    # Effective bandwidth: full width at half maximum of |F|
    mag = np.abs(F)
    fwhm = np.ptp(xi[mag > 0.5 * mag.max()]) if mag.max() > 0 else 0.0
    results[a] = {'F': F, 'xi': xi, 'F_theory': F_theory, 'err': err, 'fwhm': fwhm}
    print(f'a={a:.1f}: peak at xi=0 = {np.abs(F[len(F)//2]):.4f} (theory: {1/a:.4f}), FWHM = {fwhm:.2f}, error = {err:.2e}')

print()
print('Time-bandwidth product: (width in time) x (width in freq) = constant')
for a in a_vals:
    sigma_t = 1/a  # Gaussian width in time (sigma_t = 1/a)
    sigma_f = a    # Gaussian width in freq
    print(f'  a={a:.1f}: sigma_t={sigma_t:.2f}, sigma_f={sigma_f:.2f}, product={sigma_t*sigma_f:.2f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle(r'Scaling: $f(at) \leftrightarrow \frac{1}{|a|}\hat{f}(\xi/a)$')

    for a in a_vals:
        f_scaled = np.exp(-np.pi * (a*t)**2)
        mask_t = np.abs(t) < 3
        axes[0].plot(t[mask_t], f_scaled[mask_t], label=f'$f({a}t)$')

    axes[0].set_xlabel('Time $t$')
    axes[0].set_ylabel('$f(at)$')
    axes[0].set_title('Time domain: narrow for large $a$')
    axes[0].legend()

    for a in a_vals:
        xi_r, F_r = results[a]['xi'], results[a]['F']
        mask_f = np.abs(xi_r) < 6
        axes[1].plot(xi_r[mask_f], np.abs(F_r[mask_f]), label=f'$a={a}$')

    axes[1].set_xlabel(r'Frequency $\xi$')
    axes[1].set_ylabel(r'$|\hat{f}(\xi)|$')
    axes[1].set_title('Freq domain: wide for large $a$ (time-bandwidth tradeoff)')
    axes[1].legend()

    fig.tight_layout()
    plt.show()

3.3 Differentiation Property

Differentiation in time corresponds to multiplication by $2\pi i\xi$ in frequency. This converts ODEs into algebraic equations and explains why smooth functions have rapidly decaying spectra.

Code cell 16

# === 3.3 Differentiation Property ===
# Verify: FT[f'(t)](xi) = 2*pi*i*xi * FT[f](xi)

t = np.linspace(-10, 10, 8192)
dt = t[1] - t[0]

# Use Gaussian: f'(t) = -2*pi*t*e^{-pi*t^2}
f = np.exp(-np.pi * t**2)
f_prime = -2*np.pi*t * f  # exact derivative

xi, F = numerical_ft(f, t)
xi2, F_prime = numerical_ft(f_prime, t)

# Theory: FT[f'](xi) = 2*pi*i*xi * FT[f](xi)
F_prime_theory = 2j * np.pi * xi * F

mask = np.abs(xi) < 4
err = np.max(np.abs(F_prime[mask] - F_prime_theory[mask]))
print(f'Differentiation property error: {err:.2e}')
ok = err < 0.01
print(f"{'PASS' if ok else 'FAIL'} - FT[f\'] = 2*pi*i*xi * FT[f]")

# Smoothness vs decay: compare FT of rect (jump) vs triangle (continuous)
rect_pulse = (np.abs(t) <= 0.5).astype(float)
tri_pulse = np.maximum(1 - np.abs(t), 0)
xi_r, F_rect = numerical_ft(rect_pulse, t)
xi_t, F_tri = numerical_ft(tri_pulse, t)

print('\nSpectral decay comparison:')
xi_ref = np.array([0.5, 1.5, 2.5, 4.5, 9.5])  # half-integers: both sincs vanish at integer xi
for xr in xi_ref:
    idx_r = np.argmin(np.abs(xi_r - xr))
    idx_t = np.argmin(np.abs(xi_t - xr))
    print(f'  xi={xr}: |rect_FT|={np.abs(F_rect[idx_r]):.4f}, |tri_FT|={np.abs(F_tri[idx_t]):.4f}')
    print(f'         (envelope: ~{1/(np.pi*xr):.4f} vs ~{1/(np.pi*xr)**2:.4f})')
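The "ODEs become algebraic" claim is easy to demonstrate directly. A supplementary sketch (not an original cell): solve $u - u'' = g$ by dividing by $1 + 4\pi^2\xi^2$ in the frequency domain, then confirm the residual with finite differences. The Gaussian right-hand side is an arbitrary choice.

```python
import numpy as np
from scipy import fft as sp_fft

t = np.linspace(-20, 20, 8192)
dt = t[1] - t[0]
g = np.exp(-np.pi * t**2)  # smooth right-hand side (arbitrary choice)

# FT[u''] = (2*pi*i*xi)^2 FT[u], so (1 + 4*pi^2*xi^2) * u_hat = g_hat
G = sp_fft.fft(g)
xi = sp_fft.fftfreq(len(t), d=dt)
U = G / (1 + 4 * np.pi**2 * xi**2)  # the ODE is now an algebraic division
u = np.real(sp_fft.ifft(U))

# Check u - u'' = g with a finite-difference second derivative (interior only,
# to avoid one-sided differences and periodic wrap-around at the edges)
u_dd = np.gradient(np.gradient(u, dt), dt)
residual = np.max(np.abs(u - u_dd - g)[1000:-1000])
print(f'max ODE residual in the interior: {residual:.2e}')
```

The residual is limited by the finite-difference approximation of $u''$, not by the spectral solve itself, which satisfies the discrete equation exactly.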

4. Fourier Inversion Theorem

4.1 Numerical Verification of Inversion

We verify that $f(t) = \int \hat{f}(\xi)\,e^{2\pi i\xi t}\,d\xi$ recovers $f$ for both smooth and piecewise-smooth functions, paying attention to the Gibbs-like behavior at discontinuities.

Code cell 18

# === 4.1 Fourier Inversion Theorem ===
t = np.linspace(-10, 10, 8192)

def ft_and_invert(f, t):
    """Forward FT, then inverse FT to reconstruct f."""
    dt = t[1] - t[0]
    N = len(t)
    # Forward FT
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(N, d=dt))
    # Inverse FT: f(t) = int F(xi) e^{2pi*i*xi*t} dxi, via the inverse FFT
    f_rec = np.real(sp_fft.fftshift(sp_fft.ifft(sp_fft.ifftshift(F))) / dt)
    return xi, F, f_rec

# Test 1: Gaussian (smooth)
f_gauss = np.exp(-np.pi * t**2)
_, F_gauss, f_gauss_rec = ft_and_invert(f_gauss, t)
err_gauss = np.max(np.abs(f_gauss_rec - f_gauss))
print(f'Gaussian recovery error: {err_gauss:.2e}')
ok_g = err_gauss < 0.001
print(f"{'PASS' if ok_g else 'FAIL'} - Gaussian inversion")

# Test 2: Rectangular pulse (jump discontinuities)
f_rect = (np.abs(t) <= 1.0).astype(float)
_, F_rect, f_rect_rec = ft_and_invert(f_rect, t)
# At interior points, should recover well; at jumps t=+/-1, should get ~0.5
err_interior = np.max(np.abs(f_rect_rec[(np.abs(t) < 0.9)] - 1.0))
val_at_jump = f_rect_rec[np.argmin(np.abs(t - 1.0))]
print(f'\nRect: interior recovery error: {err_interior:.4f}')
print(f'Rect: value at jump t=1: {val_at_jump:.4f} (theory: 0.5)')
ok_j = abs(val_at_jump - 0.5) < 0.15
print(f"{'PASS' if ok_j else 'FAIL'} - Inversion averages at jump discontinuity")

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle('Fourier Inversion: $f = \\mathcal{F}^{-1}[\\hat{f}]$')

    mask = np.abs(t) < 3
    axes[0].plot(t[mask], f_gauss[mask], color=COLORS['primary'], linewidth=2, label='Original')
    axes[0].plot(t[mask], f_gauss_rec[mask], color=COLORS['secondary'],
                 linestyle='--', linewidth=1.5, label='Recovered')
    axes[0].set_title('Gaussian (smooth): perfect recovery')
    axes[0].set_xlabel('$t$'); axes[0].set_ylabel('$f(t)$')
    axes[0].legend()

    mask2 = np.abs(t) < 2.5
    axes[1].plot(t[mask2], f_rect[mask2], color=COLORS['primary'], linewidth=2, label='Original')
    axes[1].plot(t[mask2], f_rect_rec[mask2], color=COLORS['secondary'],
                 linestyle='--', linewidth=1.5, label='Recovered (Gibbs at jumps)')
    axes[1].axhline(0.5, color=COLORS['neutral'], linestyle=':', linewidth=1, label='0.5 = avg at jump')
    axes[1].set_title('Rect (jump discontinuity): Gibbs phenomenon')
    axes[1].set_xlabel('$t$'); axes[1].set_ylabel('$f(t)$')
    axes[1].legend()

    fig.tight_layout()
    plt.show()

4.2 The Self-Dual Gaussian

The Gaussian $g(t) = e^{-\pi t^2}$ satisfies $\hat{g} = g$. We verify this and also show the general scaled Gaussian family: $\mathcal{F}[e^{-\pi t^2/\sigma^2}](\xi) = \sigma e^{-\pi\sigma^2\xi^2}$.

Code cell 20

# === 4.2 Self-dual Gaussian and scaling ===
t = np.linspace(-10, 10, 8192)

sigma_vals = [0.5, 1.0, 2.0, 4.0]
print('Self-dual Gaussian family: FT[e^{-pi*t^2/sigma^2}](xi) = sigma * e^{-pi*sigma^2*xi^2}')
print(f"{'sigma':>8} {'time_peak':>12} {'freq_peak (num)':>18} {'freq_peak (theory)':>20}")
print('-' * 65)
for sigma in sigma_vals:
    f = np.exp(-np.pi * t**2 / sigma**2)
    xi, F = numerical_ft(f, t)
    freq_peak = np.max(np.real(F))
    theory_peak = sigma  # peak at xi=0 is sigma
    print(f"{sigma:>8.1f} {1.0:>12.4f} {freq_peak:>18.4f} {theory_peak:>20.4f}")

# Special case sigma=1: self-dual
g = np.exp(-np.pi * t**2)
xi, G = numerical_ft(g, t)
mask_self = np.abs(xi) < 4
self_dual_err = np.max(np.abs(np.real(G[mask_self]) - np.exp(-np.pi * xi[mask_self]**2)))
print(f'\nSelf-duality error for sigma=1: {self_dual_err:.2e}')
ok = self_dual_err < 0.001
print(f"{'PASS' if ok else 'FAIL'} - Gaussian is self-dual: FT[e^(-pi*t^2)] = e^(-pi*xi^2)")

# Apply FT four times: FT^2 = parity and FT^4 = identity.
# Note the grids alternate: the FT of samples spaced dt lives on a grid
# spaced dxi = 1/(N*dt), so the repeated transforms alternate scalings.
dt = t[1] - t[0]
N = len(t)
dxi = 1.0 / (N * dt)

def ft_once(x, spacing):
    return sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(x))) * spacing

F1 = ft_once(g, dt)
F2 = ft_once(F1, dxi)  # = g(-t), the parity operator (g is even, so = g)
F3 = ft_once(F2, dt)
F4 = ft_once(F3, dxi)
four_ft_err = np.max(np.abs(np.real(F4[np.abs(t) < 4]) - g[np.abs(t) < 4]))
print(f'FT^4[g] == g error: {four_ft_err:.2e}')
ok2 = four_ft_err < 0.01
print(f"{'PASS' if ok2 else 'FAIL'} - Applying FT 4 times returns original (FT has order 4)")
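Since the FT has order 4, its eigenvalues are fourth roots of unity $\{1, -i, -1, i\}$. A supplementary sketch (not an original cell): the first Hermite-type function $h_1(t) = t\,e^{-\pi t^2}$ is an eigenfunction with eigenvalue $-i$, which follows from applying the differentiation property to the self-dual Gaussian.

```python
import numpy as np
from scipy import fft as sp_fft

N, dt = 4096, 0.005
t = (np.arange(N) - N // 2) * dt  # grid with t = 0 exactly on a sample

h1 = t * np.exp(-np.pi * t**2)
F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(h1))) * dt
xi = sp_fft.fftshift(sp_fft.fftfreq(N, d=dt))

# From g' = -2*pi*t*g and FT[g'] = 2*pi*i*xi*g_hat:
#   FT[t e^{-pi t^2}](xi) = -i * xi * e^{-pi xi^2}   (eigenvalue -i)
theory = -1j * xi * np.exp(-np.pi * xi**2)
mask = np.abs(xi) < 4
err = np.max(np.abs(F[mask] - theory[mask]))
print(f'eigenfunction (eigenvalue -i) error: {err:.2e}')
```

More generally, the $n$-th Hermite function has eigenvalue $(-i)^n$, which is how the four eigenspaces of the FT are organized.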

5. Plancherel's Theorem and Energy Conservation

Plancherel's theorem states $\|\hat{f}\|_2 = \|f\|_2$: the Fourier Transform is a unitary isometry on $L^2(\mathbb{R})$, so energy is perfectly conserved.

Code cell 22

# === 5.1 Plancherel's Theorem — Numerical Verification ===
import numpy as np
from scipy import fft as sp_fft
try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120,
                          'axes.spines.top': False, 'axes.spines.right': False})
    COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
              'tertiary': '#009988', 'error': '#CC3311',
              'neutral': '#555555', 'highlight': '#EE3377'}
    HAS_MPL = True
except ImportError:
    HAS_MPL = False
np.random.seed(42)

def numerical_ft(f_vals, t_vals):
    dt = t_vals[1] - t_vals[0]
    N = len(t_vals)
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(N, d=dt))
    return xi, F

t = np.linspace(-15, 15, 8192)
dt = t[1] - t[0]

test_functions = [
    ('Gaussian e^{-pi*t^2}', np.exp(-np.pi * t**2)),
    ('Tri pulse max(1-|t|,0)', np.maximum(1 - np.abs(t), 0)),
    ('Decaying sinusoid', np.exp(-np.abs(t)) * np.cos(2*np.pi*t)),
    ('Narrow Gaussian e^{-pi*(2t)^2}', np.exp(-np.pi * (2*t)**2)),
]

print(f"{'Function':40s} {'||f||_2^2':>12} {'||F||_2^2':>12} {'Parseval err':>14}")
print('-' * 82)
errs = []
for name, f in test_functions:
    xi, F = numerical_ft(f, t)
    dxi = xi[1] - xi[0]
    energy_t = np.sum(np.abs(f)**2) * dt
    energy_f = np.sum(np.abs(F)**2) * dxi
    err = abs(energy_t - energy_f) / energy_t
    errs.append(err)
    print(f"{name:40s} {energy_t:>12.6f} {energy_f:>12.6f} {err:>14.2e}")

print()
print('PASS' if max(errs) < 0.001 else 'FAIL',
      '- Parseval: ||f||^2 = ||F||^2 for all test functions')

Code cell 23

# === 5.2 Using Parseval to Evaluate Integrals ===
# Example: compute integral of sinc^2(xi) d xi using Parseval
# Since FT[rect](xi) = sinc(xi), Parseval gives:
# integral |sinc(xi)|^2 dxi = integral |rect(t)|^2 dt = integral_{-0.5}^{0.5} 1 dt = 1

t = np.linspace(-10, 10, 16384)
dt = t[1] - t[0]

rect = (np.abs(t) <= 0.5).astype(float)
xi, F_rect = numerical_ft(rect, t)
dxi = xi[1] - xi[0]

# Direct computation of integral sinc^2(xi) dxi
sinc_sq_integral = np.sum(np.sinc(xi)**2) * dxi

# Via Parseval: = integral |rect(t)|^2 dt
rect_sq_integral = np.sum(rect**2) * dt

print('Computing integral_{-inf}^{inf} sinc^2(xi) dxi using Parseval:')
print(f'Direct sinc^2 integral: {sinc_sq_integral:.6f}')
print(f'Via Parseval (rect norm): {rect_sq_integral:.6f}')
print(f'Theory (= 1): 1.000000')
ok = abs(sinc_sq_integral - 1.0) < 0.002
print(f"{'PASS' if ok else 'FAIL'} - integral sinc^2(xi)dxi = 1 via Parseval")

# Second example: integral 1/(1+(2pi*t)^2)^2 dt
# FT[1/(1+(2pi*t)^2)](xi) = (1/2)*e^{-|xi|}  (dual pair of FT[e^{-|t|}])
# Parseval: integral 1/(1+(2pi*t)^2)^2 dt = integral (1/4)*e^{-2|xi|} dxi
#         = (1/4) * 2 * integral_0^inf e^{-2*xi} dxi = (1/4) * 2 * (1/2) = 1/4
f_lorentz = 1.0 / (1 + (2*np.pi*t)**2)
lorentz_sq = np.sum(f_lorentz**2) * dt
print(f'\ninteg 1/(1+(2pi*t)^2)^2 dt = {lorentz_sq:.6f} (theory: {1/4:.6f})')
ok2 = abs(lorentz_sq - 0.25) < 0.005
print(f"{'PASS' if ok2 else 'FAIL'} - Parseval application to Lorentzian squared")

6. Heisenberg Uncertainty Principle

Theorem: For any $f \in L^2(\mathbb{R})$ normalized to $\|f\|_2 = 1$:

$$\Delta t \cdot \Delta\xi \geq \frac{1}{4\pi}$$

Equality holds iff $f$ is a Gaussian.

Code cell 25

# === 6.1 Uncertainty Principle — Numerical Verification ===

def compute_uncertainty(f, t):
    """Compute time spread Delta_t and frequency spread Delta_xi."""
    dt = t[1] - t[0]
    xi, F = numerical_ft(f, t)
    dxi = xi[1] - xi[0]

    # Normalize: |f|^2 and |F|^2 as probability densities
    norm_t = np.sum(np.abs(f)**2) * dt
    norm_f = np.sum(np.abs(F)**2) * dxi

    if norm_t < 1e-10:
        return None, None, None

    p_t = np.abs(f)**2 / norm_t  # time probability density
    p_f = np.abs(F)**2 / norm_f  # freq probability density

    # Time center and spread
    mu_t = np.sum(t * p_t) * dt
    Delta_t = np.sqrt(np.sum((t - mu_t)**2 * p_t) * dt)

    # Frequency center and spread
    mu_xi = np.sum(xi * p_f) * dxi
    Delta_xi = np.sqrt(np.sum((xi - mu_xi)**2 * p_f) * dxi)

    return Delta_t, Delta_xi, Delta_t * Delta_xi

t = np.linspace(-20, 20, 16384)
bound = 1 / (4 * np.pi)
print(f'Uncertainty bound: 1/(4*pi) = {bound:.6f}')
print()

test_signals = [
    ('Gaussian sigma=1.0 (extremal)', np.exp(-np.pi * t**2)),
    ('Gaussian sigma=2.0', np.exp(-np.pi * t**2 / 4)),
    ('Gaussian sigma=0.5', np.exp(-np.pi * t**2 * 4)),
    ('Rectangular pulse w=2', (np.abs(t) <= 1.0).astype(float)),
    ('Hann window w=4', np.where(np.abs(t) <= 2, 0.5*(1+np.cos(np.pi*t/2)), 0)),
    ('Decaying cos', np.exp(-t**2/2)*np.cos(2*np.pi*2*t)),
]

print(f"{'Signal':35s} {'Delta_t':>10} {'Delta_xi':>10} {'Product':>10} {'Ratio/bound':>12}")
print('-' * 82)
for name, f in test_signals:
    Dt, Dxi, prod = compute_uncertainty(f, t)
    if Dt is not None:
        ratio = prod / bound
        print(f"{name:35s} {Dt:>10.4f} {Dxi:>10.4f} {prod:>10.4f} {ratio:>12.4f}")
        assert prod >= bound - 0.001, f'Uncertainty principle violated for {name}!'

print()
print('PASS - All signals satisfy Delta_t * Delta_xi >= 1/(4*pi)')
print('The Gaussian achieves the minimum (ratio ~ 1.0)')
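For the Gaussian family the product can also be obtained in closed form: $|f|^2$ is itself Gaussian, giving $\Delta t = \sigma/(2\sqrt{\pi})$ and $\Delta\xi = 1/(2\sqrt{\pi}\sigma)$, so the product is exactly $1/(4\pi)$ for every $\sigma$. A supplementary quadrature check (not an original cell; the width 1.7 is arbitrary):

```python
import numpy as np
from scipy.integrate import quad

sigma = 1.7  # arbitrary Gaussian width

# Time side: |f(t)|^2 = e^{-2*pi*t^2/sigma^2}, an (unnormalized) Gaussian density
p_t = lambda u: np.exp(-2 * np.pi * u**2 / sigma**2)
norm_t, _ = quad(p_t, -np.inf, np.inf)
var_t, _ = quad(lambda u: u**2 * p_t(u), -np.inf, np.inf)
Delta_t = np.sqrt(var_t / norm_t)

# Frequency side: f_hat(xi) = sigma*e^{-pi*sigma^2*xi^2} (section 4.2), so
# |f_hat|^2 is proportional to e^{-2*pi*sigma^2*xi^2}
p_f = lambda u: np.exp(-2 * np.pi * sigma**2 * u**2)
norm_f, _ = quad(p_f, -np.inf, np.inf)
var_f, _ = quad(lambda u: u**2 * p_f(u), -np.inf, np.inf)
Delta_xi = np.sqrt(var_f / norm_f)

print(f'Delta_t*Delta_xi = {Delta_t * Delta_xi:.6f}, 1/(4*pi) = {1/(4*np.pi):.6f}')
```

This matches the flat line seen in the sigma-sweep plot of cell 26: every Gaussian sits exactly on the bound.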

Code cell 26

# === 6.2 Uncertainty Principle Visualization ===
# Show the time-frequency tradeoff: wider in time <-> narrower in freq

if HAS_MPL:
    t_viz = np.linspace(-10, 10, 8192)
    sigma_vals = [0.4, 0.7, 1.0, 1.5, 2.5]

    fig, axes = plt.subplots(2, 1, figsize=(12, 8))
    fig.suptitle(r'Uncertainty Principle: $\Delta t \cdot \Delta\xi \geq 1/(4\pi)$', fontsize=15)

    cmap = plt.cm.viridis
    for i, sigma in enumerate(sigma_vals):
        color = cmap(i / len(sigma_vals))
        f = np.exp(-np.pi * t_viz**2 / sigma**2)
        f /= np.sqrt(np.sum(f**2) * (t_viz[1]-t_viz[0]))  # normalize
        xi_v, F_v = numerical_ft(f, t_viz)

        mask_t = np.abs(t_viz) < 5
        mask_f = np.abs(xi_v) < 5

        axes[0].plot(t_viz[mask_t], np.abs(f[mask_t])**2,
                     color=color, label=f'$\\sigma={sigma}$')
        axes[1].plot(xi_v[mask_f], np.abs(F_v[mask_f])**2,
                     color=color, label=f'$\\sigma={sigma}$')

    axes[0].set_xlabel('Time $t$')
    axes[0].set_ylabel(r'$|f(t)|^2$ (probability density)')
    axes[0].set_title('Time domain: wider sigma = more spread')
    axes[0].legend(loc='upper right')

    axes[1].set_xlabel(r'Frequency $\xi$')
    axes[1].set_ylabel(r'$|\hat{f}(\xi)|^2$')
    axes[1].set_title('Freq domain: wider sigma = narrower spectrum (inverse relation)')
    axes[1].legend(loc='upper right')

    fig.tight_layout()
    plt.show()

    # Show product Delta_t * Delta_xi vs sigma
    sigmas = np.linspace(0.2, 4.0, 40)
    products = []
    for sigma in sigmas:
        f = np.exp(-np.pi * t_viz**2 / sigma**2)
        Dt, Dxi, prod = compute_uncertainty(f, t_viz)
        products.append(prod)

    fig2, ax2 = plt.subplots(figsize=(9, 5))
    ax2.plot(sigmas, products, color=COLORS['primary'], linewidth=2.5,
             label=r'$\Delta t \cdot \Delta\xi$ for Gaussian')
    ax2.axhline(bound, color=COLORS['error'], linestyle='--',
                linewidth=2, label=f'Bound $1/(4\\pi) = {bound:.4f}$')
    ax2.set_xlabel(r'Gaussian width $\sigma$')
    ax2.set_ylabel(r'$\Delta t \cdot \Delta\xi$')
    ax2.set_title('All Gaussians achieve the uncertainty bound (product = const)')
    ax2.legend()
    fig2.tight_layout()
    plt.show()

7. Tempered Distributions: Dirac Delta and Dirac Comb

7.1 The Dirac Delta and Its Fourier Transform

The Dirac delta $\delta(t)$ is not a function but a distribution: $\langle\delta,\phi\rangle = \phi(0)$. Its Fourier Transform is $\mathcal{F}[\delta](\xi) = 1$ (flat spectrum). Conversely, $\mathcal{F}[1](\xi) = \delta(\xi)$.

Code cell 28

# === 7.1 Dirac Delta: Approximation and FT ===
# Approximate delta(t) by a sequence of narrow Gaussians

t = np.linspace(-5, 5, 8192)

epsilons = [1.0, 0.5, 0.2, 0.1]
print('Delta approximation: delta_eps(t) = (1/eps)*e^{-pi*t^2/eps^2}')
print('FT of delta_eps: e^{-pi*eps^2*xi^2} -> 1 as eps->0 (flat spectrum)')
print()

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))
    fig.suptitle('Dirac Delta: $\\delta_\\epsilon(t) \\to \\delta(t)$ and $\\mathcal{F}[\\delta]=1$')

    cmap = plt.cm.viridis
    for i, eps in enumerate(epsilons):
        delta_eps = (1/eps) * np.exp(-np.pi * t**2 / eps**2)
        xi_d, F_d = numerical_ft(delta_eps, t)
        color = cmap(i/len(epsilons))

        mask_t = np.abs(t) < 2
        axes[0].plot(t[mask_t], delta_eps[mask_t], color=color, label=f'$\\epsilon={eps}$')

        mask_f = np.abs(xi_d) < 6
        axes[1].plot(xi_d[mask_f], np.real(F_d[mask_f]), color=color, label=f'$\\epsilon={eps}$')

    axes[0].set_title(r'$\delta_\epsilon(t) = \frac{1}{\epsilon}e^{-\pi t^2/\epsilon^2}$')
    axes[0].set_xlabel('$t$'); axes[0].set_ylabel(r'$\delta_\epsilon(t)$')
    axes[0].legend()

    axes[1].axhline(1.0, color=COLORS['error'], linestyle='--', linewidth=2, label='Limit: $\\hat{\\delta}=1$')
    axes[1].set_title(r'FT of $\delta_\epsilon$: $e^{-\pi\epsilon^2\xi^2} \to 1$')
    axes[1].set_xlabel(r'$\xi$'); axes[1].set_ylabel(r'$\mathcal{F}[\delta_\epsilon](\xi)$')
    axes[1].legend()

    fig.tight_layout()
    plt.show()

print('Key facts:')
print('  FT[delta(t)] = 1  (impulse has flat/white spectrum)')
print('  FT[1] = delta(xi) (constant has spectrum only at xi=0)')
print('  FT[e^{2pi*i*xi0*t}] = delta(xi - xi0)  (pure tone = spike in spectrum)')

7.2 The Dirac Comb and Poisson Summation

The Dirac comb $\text{III}_T(t) = \sum_n \delta(t - nT)$ has Fourier transform:

$$\mathcal{F}[\text{III}_T](\xi) = \frac{1}{T}\,\text{III}_{1/T}(\xi)$$

The Poisson summation formula: $\sum_n f(n) = \sum_n \hat{f}(n)$.

Code cell 30

# === 7.2 Dirac Comb and Poisson Summation Formula ===

# Poisson Summation: sum_n f(n) = sum_n F_hat(n)
# Test with f(t) = e^{-pi*t^2}, F_hat(xi) = e^{-pi*xi^2}

print('Poisson Summation Formula: sum_n f(n) = sum_n f_hat(n)')
print('For f(t) = e^{-pi*t^2}, hat{f}(xi) = e^{-pi*xi^2} (self-dual):')
print()

N_terms = [5, 10, 20, 50]
for N in N_terms:
    ns = np.arange(-N, N+1)
    lhs = np.sum(np.exp(-np.pi * ns**2))  # sum_n f(n)
    rhs = np.sum(np.exp(-np.pi * ns**2))  # sum_n hat{f}(n) = same (self-dual!)
    print(f'  N={N:3d}: LHS = {lhs:.10f}, RHS = {rhs:.10f}, diff = {abs(lhs-rhs):.2e}')

ok = abs(lhs - rhs) < 1e-10
print(f"{'PASS' if ok else 'FAIL'} - Poisson summation holds (trivially for self-dual Gaussian)")

# More interesting test: f(t) = e^{-pi*t^2/sigma^2} (not self-dual)
# f(n) = e^{-pi*n^2/sigma^2}
# hat{f}(xi) = sigma * e^{-pi*sigma^2*xi^2}
# Poisson: sum_n e^{-pi*n^2/sigma^2} = sigma * sum_n e^{-pi*sigma^2*n^2}

print()
print('Non-self-dual: f(t) = e^{-pi*t^2/sigma^2}')
print('Poisson: sum_n e^{-pi*n^2/sigma^2} = sigma * sum_n e^{-pi*sigma^2*n^2}')
print()
errs = []
for sigma in [0.5, 1.0, 2.0, 3.0]:
    ns = np.arange(-100, 101)
    lhs = np.sum(np.exp(-np.pi * ns**2 / sigma**2))
    rhs = sigma * np.sum(np.exp(-np.pi * sigma**2 * ns**2))
    errs.append(abs(lhs - rhs))
    print(f'  sigma={sigma:.1f}: LHS={lhs:.8f}, RHS={rhs:.8f}, err={errs[-1]:.2e}')

ok = max(errs) < 1e-8
print(f"{'PASS' if ok else 'FAIL'} - Poisson summation formula verified for all sigma")
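The checks above verify Poisson summation; the comb's transform itself can also be inspected directly. A minimal sketch, assuming each delta is approximated by a narrow Gaussian of width `eps` (as in section 7.1) and truncating the comb to a finite spike train:

```python
import numpy as np
from scipy import fft as sp_fft

def numerical_ft(f_vals, t_vals):
    dt = t_vals[1] - t_vals[0]
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=dt))
    return xi, F

T = 0.5                      # comb spacing in time
eps = 0.02                   # width of each Gaussian spike (delta approximation)
t = np.linspace(-20, 20, 2**16)

# Finite spike train: sum of narrow unit-mass Gaussians at t = n*T
comb = np.zeros_like(t)
for n in range(-39, 40):
    comb += (1/eps) * np.exp(-np.pi * (t - n*T)**2 / eps**2)

xi, F = numerical_ft(comb, t)

# Theory: FT[III_T] = (1/T) III_{1/T}, i.e. spectral spikes every 1/T = 2.0
peak_xis = []
for k in range(1, 4):
    window = np.abs(xi - k/T) < 0.5/T
    peak_xis.append(xi[window][np.argmax(np.abs(F[window]))])
spacing = np.diff(peak_xis).mean()
print(f'comb spacing T = {T} -> spectral peaks at {[f"{p:.3f}" for p in peak_xis]}')
print(f'spectral spacing ~ {spacing:.3f} (theory: 1/T = {1/T:.3f})')
```

The finite train turns each delta in frequency into a narrow Dirichlet-kernel lobe, but the lobe spacing already matches $1/T$.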

8. Applications in Machine Learning

8.1 FNet: Token Mixing with Fourier Transforms

FNet replaces self-attention with a 2D DFT. We implement both and compare on a simple global aggregation task.

Code cell 32

# === 8.1 FNet vs Attention Mixing ===
import numpy as np
from scipy import fft as sp_fft
try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120,
                          'axes.spines.top': False, 'axes.spines.right': False})
    COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
              'tertiary': '#009988', 'error': '#CC3311',
              'neutral': '#555555', 'highlight': '#EE3377'}
    HAS_MPL = True
except ImportError:
    HAS_MPL = False
np.random.seed(42)

def fnet_mixing(X):
    """
    FNet mixing: apply 2D DFT over (seq, model) dimensions, take real part.
    X: (batch, seq_len, d_model)
    """
    X_fft = sp_fft.fft(sp_fft.fft(X, axis=1), axis=2)
    return np.real(X_fft)

def attention_mixing(X, scale=True):
    """
    Self-attention mixing (no learned weights; X serves as query, key, and value).
    X: (seq_len, d_model)
    """
    d = X.shape[-1]
    scale_factor = np.sqrt(d) if scale else 1.0
    # Raw attention (X as both Q and K for simplicity)
    scores = X @ X.T / scale_factor  # (seq, seq)
    # Softmax
    scores_stable = scores - scores.max(axis=-1, keepdims=True)
    A = np.exp(scores_stable)
    A /= A.sum(axis=-1, keepdims=True)
    return A @ X  # (seq, d_model)

# Compare mixing matrices (how each position sees all others)
N, d = 8, 16
X = np.random.randn(N, d)

# FNet mixing matrix: what is the effective weight from position j to position i?
# For FNet: Y = Re(F_N X F_d^T), so mixing in seq dim is F_N (DFT matrix)
F_N = np.fft.fft(np.eye(N), axis=0) / np.sqrt(N)  # unitary DFT
fnet_seq_mixing = np.real(F_N @ np.conj(F_N).T)  # effective mixing = I (identity after re+im)

# More direct: show that FNet output for Y_{k,:} depends on all input positions
Y_fnet = fnet_mixing(X[np.newaxis,:,:])[0]  # (N, d)
Y_attn = attention_mixing(X)

# Check global mixing: does each output position depend on ALL input positions?
# For FNet: Y = Re(F_N X F_d), and DFT matrices have no zero entries, so every
# output entry is a weighted sum over ALL input positions -> fully global
print('FNet mixing: global (all positions contribute to each output)')
print('Attention mixing: global (but data-dependent weights)')
print()
print(f'FNet output norm: {np.linalg.norm(Y_fnet):.4f}')
print(f'Input norm:       {np.linalg.norm(X):.4f}')
# The unnormalized 2D FFT scales energy by N*d (Parseval), and Re(.) discards
# the imaginary part, so compare norms after undoing the sqrt(N*d) factor:
ratio = np.linalg.norm(Y_fnet) / (np.sqrt(N*d) * np.linalg.norm(X))
print(f'Norm ratio after sqrt(N*d) rescaling: {ratio:.4f}')
print('(The DFT is unitary up to normalization; taking Re(.) is not an isometry.)')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle('FNet vs Attention: Mixing Patterns')

    # FNet: show |DFT matrix| as mixing pattern
    F_mat = np.abs(np.fft.fft(np.eye(N), axis=0) / np.sqrt(N))
    im1 = axes[0].imshow(F_mat, cmap='viridis', aspect='auto')
    plt.colorbar(im1, ax=axes[0], label='Mixing weight')
    axes[0].set_title('FNet: DFT mixing (fixed, uniform)')
    axes[0].set_xlabel('Input position'); axes[0].set_ylabel('Output position')

    # Attention: show attention matrix
    scores = X @ X.T / np.sqrt(d)
    scores_s = scores - scores.max(axis=-1, keepdims=True)
    A = np.exp(scores_s); A /= A.sum(axis=-1, keepdims=True)
    im2 = axes[1].imshow(A, cmap='viridis', aspect='auto')
    plt.colorbar(im2, ax=axes[1], label='Attention weight')
    axes[1].set_title('Attention: data-dependent mixing')
    axes[1].set_xlabel('Input position'); axes[1].set_ylabel('Output position')

    fig.tight_layout()
    plt.show()

8.2 Random Fourier Features

RFF approximates shift-invariant kernels $k(\mathbf{x}-\mathbf{y})$ via random frequency sampling. For the RBF kernel $k(\mathbf{x},\mathbf{y})=e^{-\gamma\|\mathbf{x}-\mathbf{y}\|^2}$, the spectral density is $p(\boldsymbol{\omega}) = \mathcal{N}(\mathbf{0}, 2\gamma I)$.

Code cell 34

# === 8.2 Random Fourier Features ===

def rbf_kernel(X, Y, gamma=1.0):
    """Exact RBF kernel matrix."""
    # ||x-y||^2 = ||x||^2 + ||y||^2 - 2*x^T*y
    X_sq = np.sum(X**2, axis=1, keepdims=True)
    Y_sq = np.sum(Y**2, axis=1, keepdims=True)
    sq_dists = X_sq + Y_sq.T - 2 * X @ Y.T
    return np.exp(-gamma * sq_dists)

def rff_features(X, D, gamma=1.0, seed=0):
    """
    Random Fourier Features for RBF kernel.
    Sample omega ~ N(0, 2*gamma*I), b ~ Uniform(0, 2*pi)
    phi(x) = sqrt(2/D) * [cos(omega_j^T x + b_j)]_j
    """
    rng = np.random.RandomState(seed)
    d = X.shape[1]
    omega = rng.randn(d, D) * np.sqrt(2 * gamma)  # (d, D)
    b = rng.uniform(0, 2*np.pi, D)  # (D,)
    Z = np.sqrt(2/D) * np.cos(X @ omega + b)  # (N, D)
    return Z

np.random.seed(42)
d = 4  # input dimension
N_test = 200
gamma = 1.0

X_test = np.random.randn(N_test, d)
K_exact = rbf_kernel(X_test, X_test, gamma)

print('RBF Kernel Approximation via Random Fourier Features')
print(f'Input dim d={d}, N={N_test} test points, gamma={gamma}')
print()
print(f"{'D (num features)':20s} {'RMSE':>12} {'Max error':>12} {'Rel. error':>12}")
print('-' * 60)

D_vals = [10, 50, 100, 500, 1000, 5000]
rmse_vals = []
for D in D_vals:
    Z = rff_features(X_test, D, gamma)
    K_approx = Z @ Z.T
    diff = K_approx - K_exact
    rmse = np.sqrt(np.mean(diff**2))
    max_err = np.max(np.abs(diff))
    rel_err = rmse / np.sqrt(np.mean(K_exact**2))
    rmse_vals.append(rmse)
    print(f"{D:20d} {rmse:>12.6f} {max_err:>12.6f} {rel_err:>12.4f}")

# Check: does RMSE scale as O(1/sqrt(D))?
ratio = rmse_vals[0] / rmse_vals[-1]
D_ratio = np.sqrt(D_vals[-1] / D_vals[0])
print(f'\nRMSE ratio (D={D_vals[0]} to D={D_vals[-1]}): {ratio:.2f} (sqrt ratio: {D_ratio:.2f})')
ok = 0.3 * D_ratio < ratio < 3 * D_ratio
print(f"{'PASS' if ok else 'FAIL'} - RMSE ~ O(1/sqrt(D)) consistent with theory")

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))
    fig.suptitle('Random Fourier Features: Kernel Approximation')

    # Plot exact vs approx kernel (subset)
    n_show = 50
    K_approx_500 = rff_features(X_test[:n_show], 500, gamma) @ rff_features(X_test[:n_show], 500, gamma).T
    K_exact_sub = K_exact[:n_show, :n_show]

    axes[0].scatter(K_exact_sub.flatten(), K_approx_500.flatten(),
                    alpha=0.3, s=5, color=COLORS['primary'])
    lims = [0, 1.1]
    axes[0].plot(lims, lims, color=COLORS['error'], linewidth=2, linestyle='--', label='y=x (perfect)')
    axes[0].set_xlabel('Exact RBF kernel'); axes[0].set_ylabel('RFF approximation (D=500)')
    axes[0].set_title('Scatter: Exact vs RFF')
    axes[0].legend()

    # RMSE vs D
    axes[1].loglog(D_vals, rmse_vals, 'o-', color=COLORS['primary'],
                   linewidth=2.5, markersize=8, label='RMSE')
    # Reference O(1/sqrt(D)) line
    D_ref = np.array(D_vals)
    axes[1].loglog(D_ref, rmse_vals[0]*np.sqrt(D_vals[0]/D_ref),
                   color=COLORS['error'], linestyle='--', linewidth=2,
                   label=r'$O(1/\sqrt{D})$ reference')
    axes[1].set_xlabel('Number of random features $D$')
    axes[1].set_ylabel('RMSE')
    axes[1].set_title('Approximation error vs number of features')
    axes[1].legend()

    fig.tight_layout()
    plt.show()

8.3 Spectral Normalization

Spectral normalization divides weight matrices by $\sigma_{\max}(W)$, enforcing Lipschitz constraints. We implement power iteration and verify the normalization.

Code cell 36

# === 8.3 Spectral Normalization ===

def spectral_norm_power_iter(W, n_iter=20):
    """
    Estimate sigma_max(W) via power iteration.
    W: (m, n) matrix
    Returns: sigma_max estimate
    """
    rng = np.random.RandomState(42)
    v = rng.randn(W.shape[1])
    v /= np.linalg.norm(v)

    for _ in range(n_iter):
        u = W @ v; u /= np.linalg.norm(u)
        v = W.T @ u; v /= np.linalg.norm(v)

    sigma = u @ W @ v
    return sigma, u, v

def spectrally_normalize(W):
    sigma, u, v = spectral_norm_power_iter(W)
    return W / sigma, sigma

np.random.seed(42)

# Test on random matrices of various sizes
shapes = [(32, 32), (64, 128), (128, 64), (256, 512)]

print('Spectral Normalization: W_norm = W / sigma_max(W)')
print(f"{'Shape':>15} {'sigma_max (true)':>18} {'sigma_max (iter)':>18} {'err':>8} {'sigma_max(W_norm)':>18}")
print('-' * 80)

for shape in shapes:
    W = np.random.randn(*shape) * np.sqrt(2/shape[0])
    sigma_true = np.linalg.norm(W, ord=2)  # exact spectral norm
    sigma_iter, _, _ = spectral_norm_power_iter(W, n_iter=30)
    W_norm, sigma_est = spectrally_normalize(W)
    sigma_norm = np.linalg.norm(W_norm, ord=2)  # should be ~1.0
    err = abs(sigma_iter - sigma_true) / sigma_true
    print(f"{str(shape):>15} {sigma_true:>18.6f} {sigma_iter:>18.6f} {err:>8.2e} {sigma_norm:>18.6f}")

print()
# Verify Lipschitz property: ||W_norm * x|| <= ||x|| for all x
W_test = np.random.randn(64, 128) * np.sqrt(2/64)
W_norm_test, _ = spectrally_normalize(W_test)
x_rand = np.random.randn(1000, 128)
y_rand = (x_rand @ W_norm_test.T)
lip_ratios = np.linalg.norm(y_rand, axis=1) / np.linalg.norm(x_rand, axis=1)
print(f'Lipschitz test: max ||W_norm*x||/||x|| = {lip_ratios.max():.6f} (should be <= 1)')
ok = lip_ratios.max() <= 1.001
print(f"{'PASS' if ok else 'FAIL'} - Spectral normalization enforces 1-Lipschitz constraint")
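In training, $\sigma_{\max}$ is usually not re-estimated from scratch at every step: a single power iteration per update reuses the `u` vector from the previous step, which keeps tracking the top singular direction as the weights drift slowly. A minimal sketch; the drift model (small additive noise standing in for gradient updates) and the step count are illustrative assumptions:

```python
import numpy as np

rng = np.random.default_rng(0)
m, n = 64, 128
W = rng.standard_normal((m, n)) / np.sqrt(n)
u = rng.standard_normal(m)               # persistent estimate of top left singular vector
u /= np.linalg.norm(u)

for step in range(200):
    # One power-iteration update per "training step", reusing the stored u
    v = W.T @ u; v /= np.linalg.norm(v)
    u = W @ v;  u /= np.linalg.norm(u)
    sigma_est = u @ W @ v                # running spectral-norm estimate
    W_sn = W / sigma_est                 # spectrally normalized weight
    # Small random drift standing in for a gradient update (illustrative)
    W = W + 0.0005 * rng.standard_normal((m, n))

sigma_true = np.linalg.norm(W, ord=2)
print(f'running estimate: {sigma_est:.4f}, true sigma_max: {sigma_true:.4f}')
print(f'sigma_max of normalized W: {np.linalg.norm(W_sn, ord=2):.4f} (should be ~1)')
```

Because the weights change little per step, one iteration per update is enough to keep the estimate within a fraction of a percent of the exact spectral norm.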

8.4 RoPE: Rotary Position Embedding

RoPE encodes position $m$ as a rotation $e^{im\theta}$. We implement RoPE and verify the relative-position property: $\langle f(q,m), f(k,n)\rangle$ depends only on $m-n$.

Code cell 38

# === 8.4 RoPE: Rotary Position Embedding ===

def rope_rotate(x, pos, theta_base=10000.0):
    """
    Apply RoPE to embedding x at position pos.
    x: (d,) vector
    Returns: rotated vector of same shape
    """
    d = len(x)
    d_half = d // 2
    j = np.arange(d_half)
    theta_j = theta_base ** (-2*j / d)  # (d/2,) frequencies
    angles = pos * theta_j  # (d/2,) rotation angles

    # Rotate each pair (x_{2j}, x_{2j+1})
    x_pairs = x.reshape(-1, 2)  # (d/2, 2)
    cos_a = np.cos(angles)  # (d/2,)
    sin_a = np.sin(angles)  # (d/2,)

    x_rot = np.stack([
        x_pairs[:,0]*cos_a - x_pairs[:,1]*sin_a,
        x_pairs[:,0]*sin_a + x_pairs[:,1]*cos_a
    ], axis=-1).flatten()  # (d,)
    return x_rot

np.random.seed(42)
d = 64

# Generate random query and key vectors
q = np.random.randn(d)
k = np.random.randn(d)

print('RoPE: Relative Position Property')
print('Inner product <f(q,m), f(k,n)> should depend only on m-n')
print()

# Test: fix relative position r = m-n = 5, vary absolute positions
r = 5
positions = [(0, -5), (10, 5), (100, 95), (500, 495), (1000, 995)]
scores = []
for m, n in positions:
    q_rot = rope_rotate(q, m)
    k_rot = rope_rotate(k, n)
    score = np.dot(q_rot, k_rot)
    scores.append(score)
    print(f'  m={m:5d}, n={n:5d}, m-n={m-n:4d}: score = {score:.6f}')

score_var = np.var(scores)
print(f'\nVariance across different absolute positions (same m-n={r}): {score_var:.2e}')
ok = score_var < 1e-8
print(f"{'PASS' if ok else 'FAIL'} - RoPE inner product depends only on relative position m-n")

# Show: different relative positions give different scores
print()
print('Different relative positions give different scores:')
m_fixed = 100
for r_test in [0, 1, 5, 10, 50, 100]:
    q_rot = rope_rotate(q, m_fixed)
    k_rot = rope_rotate(k, m_fixed - r_test)
    print(f'  r = m-n = {r_test:4d}: score = {np.dot(q_rot, k_rot):.6f}')
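The pairwise rotation above is equivalent to complex multiplication by $e^{i m\theta_j}$, which is the cleanest way to see the "position as rotation" statement. A short self-contained check of that equivalence (all names below are local to this sketch):

```python
import numpy as np

rng = np.random.default_rng(42)
d = 64
x = rng.standard_normal(d)
pos = 17
theta_base = 10000.0

j = np.arange(d // 2)
theta_j = theta_base ** (-2 * j / d)
angles = pos * theta_j

# Complex view: z_j = x_{2j} + i*x_{2j+1}, rotated by multiplying with e^{i*pos*theta_j}
z = x.reshape(-1, 2) @ np.array([1.0, 1.0j])
z_rot = z * np.exp(1j * angles)
x_rot_complex = np.stack([z_rot.real, z_rot.imag], axis=-1).flatten()

# Real view: explicit 2x2 rotation of each pair (same construction as rope_rotate)
pairs = x.reshape(-1, 2)
c, s = np.cos(angles), np.sin(angles)
x_rot_real = np.stack([pairs[:, 0]*c - pairs[:, 1]*s,
                       pairs[:, 0]*s + pairs[:, 1]*c], axis=-1).flatten()

diff = np.max(np.abs(x_rot_complex - x_rot_real))
print(f'max |complex view - real view| = {diff:.2e}')
```

The relative-position property then follows immediately: $\overline{e^{im\theta}}\,e^{in\theta} = e^{i(n-m)\theta}$.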

Code cell 39

# === 8.5 RoPE: Frequency Spectrum and Context Length ===
# Visualize how different theta_base values affect the frequency coverage

d = 128
j = np.arange(d//2)

theta_bases = [10000, 100000, 500000]
labels = ['LLaMA-2 (10K)', 'Extended (100K)', 'LLaMA-3 (500K)']

print('RoPE frequencies theta_j = base^{-2j/d}')
print(f"{'j':>5}", end='')
for base in theta_bases:
    print(f"{'base='+str(base):>20}", end='')
print()
for j_show in [0, 16, 32, 48, 63]:
    print(f'{j_show:>5}:', end='')
    for base in theta_bases:
        theta = base ** (-2*j_show/d)
        period = 2*np.pi/theta
        print(f'{period:>20.1f}', end='')
    print('  (period in tokens)')

print()
print('Max period = 2*pi/theta_min:')
for base, label in zip(theta_bases, labels):
    theta_min = base ** (-1.0)  # j = d/2-1 -> base^{-(d-2)/d} ~ base^{-1}
    max_period = 2*np.pi / theta_min
    print(f'  {label}: max period ~ {max_period:.0f} tokens')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(11, 6))
    for base, label, ls in zip(theta_bases, labels, ['-', '--', ':']):
        thetas = base ** (-2*j/d)
        periods = 2*np.pi / thetas
        ax.semilogy(j, periods, linestyle=ls, linewidth=2.5, label=label)

    ax.axhline(128000, color=COLORS['neutral'], linestyle='--',
               linewidth=1, label='128K context')
    ax.set_xlabel('Dimension index $j$')
    ax.set_ylabel('Period (tokens)')
    ax.set_title('RoPE frequency periods: larger base = longer max context')
    ax.legend()
    fig.tight_layout()
    plt.show()

9. Summary: The Central Results

| Theorem | Formula | What It Means |
| --- | --- | --- |
| Definition | $\hat{f}(\xi) = \int f(t)e^{-2\pi i\xi t}\,dt$ | Decomposes $f$ into frequencies |
| Riemann-Lebesgue | $\hat{f}(\xi)\to 0$ as $\lvert\xi\rvert\to\infty$ | Integrable signals lose high-frequency content |
| Plancherel | $\lVert\hat{f}\rVert_2 = \lVert f\rVert_2$ | Energy conserved; FT is unitary |
| Inversion | $f(t) = \int\hat{f}(\xi)e^{2\pi i\xi t}\,d\xi$ | FT is invertible |
| Uncertainty | $\Delta t\cdot\Delta\xi \geq 1/(4\pi)$ | Cannot concentrate in both domains |
| Dirac delta | $\mathcal{F}[\delta]=1$, $\mathcal{F}[1]=\delta$ | Impulse = white; const = DC spike |
| Poisson sum | $\sum_n f(n)=\sum_n\hat{f}(n)$ | Unifies FT and Fourier series |
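The Riemann-Lebesgue lemma in the table has no dedicated cell in this notebook. A quick numerical illustration, using the two-sided exponential $e^{-|t|}$ (chosen because its transform $2/(1+4\pi^2\xi^2)$ is known in closed form):

```python
import numpy as np
from scipy import fft as sp_fft

def numerical_ft(f_vals, t_vals):
    dt = t_vals[1] - t_vals[0]
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=dt))
    return xi, F

t = np.linspace(-20, 20, 2**15)
f = np.exp(-np.abs(t))               # integrable => hat{f}(xi) -> 0 as |xi| -> inf
xi, F = numerical_ft(f, t)

# Closed form for comparison: hat{f}(xi) = 2 / (1 + 4*pi^2*xi^2)
probe = (0.0, 1.0, 5.0, 10.0)
mags = [np.abs(F[np.argmin(np.abs(xi - x0))]) for x0 in probe]
for x0, m in zip(probe, mags):
    theory = 2 / (1 + 4*np.pi**2*x0**2)
    print(f'|f_hat({x0:4.1f})| = {m:.6f}   (theory {theory:.6f})')
```

The $O(1/\xi^2)$ decay here matches the spectral decay theorem of section 7.5: $e^{-|t|}$ is continuous with a kink, one smoothness level above the rect function.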

7.3 Periodic Signals in the Distribution Framework

A periodic signal $\cos(2\pi\xi_0 t)$ has as its FT a pair of Dirac deltas. We approximate this with narrow Gaussians and verify the distributional formula.

Code cell 42

# === 7.3 FT of Periodic Signals (Distributional) ===
import numpy as np
from scipy import fft as sp_fft
try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120,
                          'axes.spines.top': False, 'axes.spines.right': False})
    COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
              'tertiary': '#009988', 'error': '#CC3311',
              'neutral': '#555555', 'highlight': '#EE3377'}
    HAS_MPL = True
except ImportError:
    HAS_MPL = False
np.random.seed(42)

def numerical_ft(f_vals, t_vals):
    dt = t_vals[1] - t_vals[0]
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=dt))
    return xi, F

# Approximate cos(2*pi*xi0*t) using windowed version: f(t) = cos(2*pi*xi0*t)*e^{-t^2/(2*sigma^2)}
# As sigma -> inf, this approaches pure cosine
# FT: (sigma/sqrt(2)) * [e^{-pi*sigma^2*(xi-xi0)^2} + e^{-pi*sigma^2*(xi+xi0)^2}]
# -> (1/2)[delta(xi-xi0) + delta(xi+xi0)] as sigma->inf

xi0 = 3.0  # frequency of cosine
t = np.linspace(-40, 40, 32768)

if HAS_MPL:
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    fig.suptitle(r'FT of $\cos(2\pi\xi_0 t)$: Approaches two Dirac deltas as window widens',
                 fontsize=14)

    sigmas = [1.0, 3.0, 10.0]
    for col, sigma in enumerate(sigmas):
        f = np.cos(2*np.pi*xi0*t) * np.exp(-t**2/(2*sigma**2))
        xi, F = numerical_ft(f, t)

        # Time domain
        mask_t = np.abs(t) < 3*sigma
        axes[0, col].plot(t[mask_t], f[mask_t], color=COLORS['primary'])
        axes[0, col].set_title(f'Time: $\\sigma={sigma}$')
        axes[0, col].set_xlabel('$t$'); axes[0, col].set_ylabel('$f(t)$')

        # Frequency domain
        mask_f = np.abs(xi - xi0) < 4 * (1/sigma + 0.1)
        axes[1, col].plot(xi[mask_f], np.real(F[mask_f]), color=COLORS['secondary'])
        axes[1, col].axvline(xi0, color=COLORS['error'], linestyle='--',
                              linewidth=1.5, label=f'$\\xi={xi0}$')
        axes[1, col].set_title(f'Freq: spike at $\\xi_0={xi0}$')
        axes[1, col].set_xlabel(r'$\xi$'); axes[1, col].set_ylabel(r'$\hat{f}(\xi)$')
        axes[1, col].legend()

    fig.tight_layout()
    plt.show()

print(f'As sigma->inf, FT of cos(2pi*{xi0}*t) concentrates at xi = +/-{xi0}')
print('In the limit: FT[cos(2*pi*xi0*t)] = (1/2)[delta(xi-xi0) + delta(xi+xi0)]')

7.4 Heat Equation Solution via Fourier Transform

The FT converts the heat PDE $\partial_t u = \kappa\,\partial_{xx}u$ into an ODE: $\partial_t \hat{u} = -4\pi^2\kappa\xi^2\hat{u}$. The solution is Gaussian convolution, showing that heat diffusion smooths out high-frequency features fastest.

Code cell 44

# === Heat Equation Solution via FT ===
# u(x,t) = (u0 * G_t)(x) where G_t(x) = (4*pi*kappa*t)^{-1/2} e^{-x^2/(4*kappa*t)}

kappa = 0.1  # diffusivity
x = np.linspace(-10, 10, 2048)
dx = x[1] - x[0]

# Initial condition: narrow Gaussian (localized heat source)
u0 = np.exp(-x**2 / 0.1) * (np.abs(x) < 3)  # narrow spike at x=0
u0 = u0 / (np.sum(u0) * dx)  # normalize to unit integral

# Solve via FT: FT[u(t)](xi) = FT[u0](xi) * e^{-4*pi^2*kappa*xi^2*t}
xi, U0 = numerical_ft(u0, x)
dxi = xi[1] - xi[0]

times = [0.0, 0.1, 0.5, 1.0, 2.0, 5.0]

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    fig.suptitle('Heat Equation: Diffusion as Gaussian Convolution', fontsize=14)

    cmap = plt.cm.viridis
    for i, t_val in enumerate(times):
        if t_val == 0:
            u_t = u0
            U_t = U0
        else:
            # Solve in frequency domain
            decay = np.exp(-4*np.pi**2 * kappa * xi**2 * t_val)
            U_t = U0 * decay
            # Inverse FT to get u(x,t)
            u_t = np.real(sp_fft.fftshift(sp_fft.ifft(sp_fft.ifftshift(U_t))) / dx)

        color = cmap(i / len(times))
        mask = np.abs(x) < 8
        axes[0].plot(x[mask], u_t[mask], color=color, label=f't={t_val}')

        mask_f = np.abs(xi) < 3
        axes[1].semilogy(xi[mask_f], np.abs(U_t[mask_f])+1e-15, color=color, label=f't={t_val}')

    axes[0].set_xlabel('Position $x$')
    axes[0].set_ylabel('Temperature $u(x,t)$')
    axes[0].set_title('Time domain: heat spreads as Gaussian')
    axes[0].legend(loc='upper right', fontsize=9)

    axes[1].set_xlabel(r'Frequency $\xi$')
    axes[1].set_ylabel(r'$|\hat{u}(\xi,t)|$')
    axes[1].set_title('Frequency domain: high freqs decay fastest')
    axes[1].legend(fontsize=9)

    fig.tight_layout()
    plt.show()

print('High-frequency components (large |xi|) decay as e^{-4*pi^2*kappa*xi^2*t}')
print('Low frequencies persist longer -> smooth features survive, sharp features diffuse away')
print('This mirrors spectral bias in neural networks (high-freq components learned last)')

7.5 The Spectral Decay Theorem

Smoothness of $f$ determines how fast $|\hat{f}(\xi)|$ decays. We compare the spectra of functions with different smoothness levels.

Code cell 46

# === Spectral Decay vs Smoothness ===
# Compare: rect (jump) -> O(1/xi), triangle (continuous) -> O(1/xi^2),
#          smooth bump (C^inf) -> faster than polynomial

t = np.linspace(-10, 10, 16384)

def smooth_bump(t, width=1.0):
    """C^inf bump function supported on [-width, width]."""
    u = t / width
    inside = np.abs(u) < 1
    result = np.zeros_like(t)
    result[inside] = np.exp(-1 / (1 - u[inside]**2))
    return result

signals = {
    'rect (jump, $C^{-1}$)': (np.abs(t) <= 1.0).astype(float),
    'triangle ($C^0$)': np.maximum(1 - np.abs(t), 0),
    'smooth bump ($C^\\infty$)': smooth_bump(t, 1.0),
    'Gaussian ($C^\\omega$ analytic)': np.exp(-np.pi * t**2),
}

expected_decay = {
    'rect (jump, $C^{-1}$)': 1,
    'triangle ($C^0$)': 2,
    'smooth bump ($C^\\infty$)': 10,  # faster than any polynomial
    'Gaussian ($C^\\omega$ analytic)': 20,  # exponential
}

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    fig.suptitle('Spectral Decay: Smoothness $\\Leftrightarrow$ Frequency Decay', fontsize=14)

    cmap = plt.cm.viridis
    colors_list = [cmap(i/4) for i in range(4)]

    for (name, f), color in zip(signals.items(), colors_list):
        xi, F = numerical_ft(f, t)
        mask_t = np.abs(t) < 3
        axes[0].plot(t[mask_t], f[mask_t], color=color, label=name.split('(')[0].strip())

        mask_f = (xi > 0.5) & (xi < 6)
        axes[1].loglog(xi[mask_f], np.abs(F[mask_f]) + 1e-15, color=color,
                       label=name.split('(')[0].strip())

    # Add reference decay lines
    xi_ref = np.linspace(1, 6, 100)
    axes[1].loglog(xi_ref, 0.3/xi_ref, 'k--', linewidth=1, alpha=0.7, label=r'$O(1/\xi)$')
    axes[1].loglog(xi_ref, 0.1/xi_ref**2, 'k:', linewidth=1, alpha=0.7, label=r'$O(1/\xi^2)$')

    axes[0].set_xlabel('$t$'); axes[0].set_ylabel('$f(t)$')
    axes[0].set_title('Time domain')
    axes[0].legend(fontsize=9)

    axes[1].set_xlabel(r'Frequency $\xi$'); axes[1].set_ylabel(r'$|\hat{f}(\xi)|$')
    axes[1].set_title('Spectrum (log-log): smoother = faster decay')
    axes[1].legend(fontsize=9)

    fig.tight_layout()
    plt.show()

# Quantify: fit power law to each spectrum
print('Power-law fits to spectral decay |F(xi)| ~ C/xi^alpha:')
for (name, f), color in zip(signals.items(), colors_list):
    xi, F = numerical_ft(f, t)
    mask_f = (xi > 1.0) & (xi < 5.0) & (np.abs(F) > 1e-10)
    if mask_f.sum() > 5:
        log_xi = np.log(xi[mask_f])
        log_F = np.log(np.abs(F[mask_f]) + 1e-15)
        slope, intercept = np.polyfit(log_xi, log_F, 1)
        print(f'  {name.split("(")[0].strip():25s}: alpha ~ {-slope:.2f}')

3.4 The Duality Theorem: Computing New Transforms

Duality: if $\mathcal{F}[f](\xi) = g(\xi)$, then $\mathcal{F}[g](\xi) = f(-\xi)$.

This lets us compute the FT of $\mathrm{sinc}(t)$ immediately: since $\mathcal{F}[\mathrm{rect}](\xi) = \mathrm{sinc}(\xi)$, duality gives $\mathcal{F}[\mathrm{sinc}](\xi) = \mathrm{rect}(-\xi) = \mathrm{rect}(\xi)$, using that rect is even.

Code cell 48

# === 3.4 Duality Theorem ===
# Verify: FT[sinc(t)](xi) = rect(xi)

t = np.linspace(-30, 30, 65536)

# sinc function
f_sinc = np.sinc(t)  # normalized: sin(pi*t)/(pi*t)

xi, F_sinc = numerical_ft(f_sinc, t)

# Theory: FT[sinc(t)](xi) = rect(xi) = 1 for |xi| < 0.5, 0 otherwise
rect_theory = (np.abs(xi) <= 0.5).astype(float)

mask = np.abs(xi) < 2
err = np.max(np.abs(np.real(F_sinc[mask]) - rect_theory[mask]))

print('Duality: FT[rect(t)](xi) = sinc(xi)')
print('  => FT[sinc(t)](xi) = rect(xi) [duality theorem]')
print(f'Numerical error: {err:.4f}')
ok = err < 0.05  # some error due to finite truncation of sinc
print(f"{'PASS' if ok else 'FAIL'} - Duality theorem: FT[sinc(t)] = rect(xi)")

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle('Duality: $\\mathcal{F}[\\mathrm{sinc}(t)](\\xi) = \\mathrm{rect}(\\xi)$')

    mask_t = np.abs(t) < 15
    axes[0].plot(t[mask_t], f_sinc[mask_t], color=COLORS['primary'])
    axes[0].set_xlabel('$t$'); axes[0].set_ylabel('$\\mathrm{sinc}(t)$')
    axes[0].set_title('Time domain: sinc function')

    mask_f = np.abs(xi) < 1.5
    axes[1].plot(xi[mask_f], np.real(F_sinc[mask_f]),
                 color=COLORS['primary'], label='FT[sinc] (numerical)')
    axes[1].plot(xi[mask_f], rect_theory[mask_f],
                 color=COLORS['error'], linestyle='--', linewidth=2, label='rect(xi) (theory)')
    axes[1].set_xlabel(r'$\xi$'); axes[1].set_ylabel(r'$\hat{f}(\xi)$')
    axes[1].set_title('Frequency: rectangular (ideal low-pass filter)')
    axes[1].legend()

    fig.tight_layout()
    plt.show()

print()
print('Physical meaning: sinc(t) in time = ideal low-pass filter (passes |xi|<0.5, blocks rest)')
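The low-pass claim can be demonstrated directly: convolve a two-tone signal with a truncated sinc kernel and check that the tone inside $|\xi|<0.5$ passes while the one outside is suppressed. A sketch; the sample rate, tone frequencies, and kernel truncation length are illustrative choices:

```python
import numpy as np

fs = 50.0                         # samples per unit time (illustrative choice)
t = np.arange(0, 100, 1/fs)
low, high = 0.2, 2.0              # one tone inside the passband |xi| < 0.5, one outside
x = np.sin(2*np.pi*low*t) + np.sin(2*np.pi*high*t)

# Truncated sinc kernel: np.sinc(t) = sin(pi t)/(pi t), cutoff frequency 0.5
tk = np.arange(-10, 10, 1/fs)
h = np.sinc(tk) / fs              # scaled so the kernel integrates to ~1 (unit DC gain)
y = np.convolve(x, h, mode='same')

def amp(sig, f0):
    """Amplitude of the f0 component, by projection onto e^{-2*pi*i*f0*t}."""
    return 2 * np.abs(np.mean(sig * np.exp(-2j*np.pi*f0*t)))

for f0 in (low, high):
    print(f'tone {f0} Hz: amplitude in = {amp(x, f0):.3f}, out = {amp(y, f0):.3f}')
```

Truncating the sinc makes the filter only approximately ideal (Gibbs ripple near the cutoff), but the out-of-band tone is still attenuated by roughly two orders of magnitude.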

5.3 Spectral Energy Distribution

We study how energy is distributed across frequencies for natural signals. This connects to compression (keep high-energy frequencies), spectral bias in neural networks, and the FNO truncation criterion.

Code cell 50

# === 5.3 Spectral Energy Distribution and Compression ===

t = np.linspace(-10, 10, 4096)

# Create a signal with known energy distribution
# f(t) = 0.8*sin(2pi*1*t) + 0.4*sin(2pi*3*t) + 0.2*sin(2pi*7*t) + noise
f_signal = (0.8*np.sin(2*np.pi*1*t) +
            0.4*np.sin(2*np.pi*3*t) +
            0.2*np.sin(2*np.pi*7*t) +
            0.05*np.random.randn(len(t)))

xi, F = numerical_ft(f_signal, t)
dxi = xi[1] - xi[0]
dt = t[1] - t[0]

energy_spectrum = np.abs(F)**2
total_energy = np.sum(energy_spectrum) * dxi

# Compression: keep only frequencies with highest energy
xi_mask = np.abs(xi) < 10
sorted_idx = np.argsort(-energy_spectrum[xi_mask])
energies_sorted = energy_spectrum[xi_mask][sorted_idx]
cumulative_energy = np.cumsum(energies_sorted) / np.sum(energies_sorted)

print('Spectral energy concentration:')
for frac in [0.5, 0.8, 0.9, 0.95, 0.99, 0.999]:
    n_modes = np.searchsorted(cumulative_energy, frac) + 1
    total_modes = xi_mask.sum()
    print(f'  {frac*100:5.1f}% energy: {n_modes:5d}/{total_modes:5d} modes ({100*n_modes/total_modes:.1f}%)')

print(f'\nParseval check: ||f||^2 = {np.sum(f_signal**2)*dt:.6f}, ||F||^2 = {total_energy:.6f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))
    fig.suptitle('Spectral Energy Distribution')

    mask_f = (xi >= 0) & (xi <= 12)
    axes[0].bar(xi[mask_f], energy_spectrum[mask_f], width=dxi*0.8,
                color=COLORS['primary'], alpha=0.8)
    axes[0].set_xlabel(r'Frequency $\xi$ (Hz)')
    axes[0].set_ylabel(r'$|\hat{f}(\xi)|^2$ (energy density)')
    axes[0].set_title('Power spectrum: spikes at 1, 3, 7 Hz')

    # Cumulative energy
    x_cum = np.arange(1, len(energies_sorted)+1)
    axes[1].semilogx(x_cum[:200], cumulative_energy[:200]*100,
                     color=COLORS['primary'], linewidth=2.5)
    for threshold in [80, 90, 95, 99]:
        n_at = np.searchsorted(cumulative_energy*100, threshold) + 1
        axes[1].axhline(threshold, color=COLORS['neutral'], linestyle=':', linewidth=1)
        axes[1].axvline(n_at, color=COLORS['neutral'], linestyle=':', linewidth=1)
    axes[1].set_xlabel('Number of modes kept')
    axes[1].set_ylabel('Cumulative energy (%)')
    axes[1].set_title('Compression: few modes capture most energy')

    fig.tight_layout()
    plt.show()
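The compression claim can be made concrete: reconstruct a three-tone signal of the same form as above from only its $k$ strongest DFT modes and measure the error. A sketch using NumPy's FFT directly (the `endpoint=False` grid is an assumption made here so the tones land on exact DFT bins):

```python
import numpy as np

rng = np.random.default_rng(42)
n = 4096
t = np.linspace(-10, 10, n, endpoint=False)   # endpoint=False puts tones on exact DFT bins
x = (0.8*np.sin(2*np.pi*1*t) + 0.4*np.sin(2*np.pi*3*t)
     + 0.2*np.sin(2*np.pi*7*t) + 0.05*rng.standard_normal(n))

X = np.fft.fft(x)
order = np.argsort(-np.abs(X))    # modes ranked by magnitude

rels = []
for k in (6, 20, 100):
    Xk = np.zeros_like(X)
    Xk[order[:k]] = X[order[:k]]  # keep only the k strongest modes
    xk = np.real(np.fft.ifft(Xk))
    rels.append(np.linalg.norm(x - xk) / np.linalg.norm(x))
    print(f'k = {k:4d} modes kept: relative L2 error = {rels[-1]:.4f}')
```

Six modes (the three tone frequencies and their negative-frequency partners) already reduce the error to the noise floor; keeping more modes only mops up noise.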

Code cell 51

# === 9. Final Summary Checks ===
import numpy as np
from scipy import fft as sp_fft
np.random.seed(42)

def numerical_ft(f_vals, t_vals):
    dt = t_vals[1] - t_vals[0]
    F = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * dt
    xi = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=dt))
    return xi, F

t = np.linspace(-15, 15, 8192)
dt = t[1] - t[0]
g = np.exp(-np.pi * t**2)
xi, G = numerical_ft(g, t)
dxi = xi[1] - xi[0]

# 1. Plancherel
ok1 = abs(np.sum(g**2)*dt - np.sum(np.abs(G)**2)*dxi) < 0.001
print(f"{'PASS' if ok1 else 'FAIL'} - Plancherel: ||f||^2 = ||F||^2")

# 2. Self-duality of Gaussian
mask_self = np.abs(xi) < 5
ok2 = np.max(np.abs(np.real(G[mask_self]) - np.exp(-np.pi * xi[mask_self]**2))) < 0.001
print(f"{'PASS' if ok2 else 'FAIL'} - Self-dual Gaussian: FT of exp(-pi*t^2) is exp(-pi*xi^2)")

# 3. Time-shift property
a = 3.0
g_shifted = np.exp(-np.pi*(t-a)**2)
xi_s, G_s = numerical_ft(g_shifted, t)
phase_theory = np.exp(-2j*np.pi*xi_s*a) * G
ok3 = np.max(np.abs(G_s[np.abs(xi_s)<3] - phase_theory[np.abs(xi_s)<3])) < 0.01
print(f"{'PASS' if ok3 else 'FAIL'} - Time-shift: FT[f(t-a)] = e^(-2pi*i*xi*a) * FT[f]")

# 4. Uncertainty principle for Gaussian
norm = np.sum(g**2)*dt
mu_t = np.sum(t*g**2)*dt/norm
Dt = np.sqrt(np.sum((t-mu_t)**2*g**2)*dt/norm)
mu_xi = np.sum(xi*np.abs(G)**2)*dxi/norm
Dxi = np.sqrt(np.sum((xi-mu_xi)**2*np.abs(G)**2)*dxi/norm)
product = Dt*Dxi
bound = 1/(4*np.pi)
ok4 = product >= bound - 0.001
print(f"{'PASS' if ok4 else 'FAIL'} - Uncertainty: Delta_t*Delta_xi = {product:.6f} >= {bound:.6f}")

# 5. Poisson summation (self-dual Gaussian, so hat{f}(n) = f(n) and RHS = LHS)
ns = np.arange(-50, 51)
lhs = np.sum(np.exp(-np.pi * ns**2))
rhs = np.sum(np.exp(-np.pi * ns**2))  # hat{f}(n) = f(n) for exp(-pi t^2)
ok5 = abs(lhs - rhs) < 1e-10
print(f"{'PASS' if ok5 else 'FAIL'} - Poisson summation formula")

print()
print('All core theorems verified numerically.')
print('Notes: see §20-02 notes.md for full proofs and applications.')

End of Theory Notebook

You have explored:

  • The T→∞ derivation of the Fourier Transform from Fourier series
  • Standard FT pairs: Gaussian (self-dual), rect→sinc, exponential→Lorentzian
  • Core properties: shift, scaling, differentiation with numerical verification
  • Inversion theorem and Gibbs behavior at jump discontinuities
  • Plancherel's theorem: energy conservation and Parseval trick
  • Heisenberg uncertainty principle: $\Delta t\cdot\Delta\xi \geq 1/(4\pi)$
  • Distributions: Dirac delta, Dirac comb, Poisson summation
  • AI applications: FNet, Random Fourier Features, Spectral Normalization, RoPE

Next: exercises.ipynb — 8 graded problems from mechanics to AI applications.

Next section: §20-03 DFT and FFT
