Theory Notebook
Converted from theory.ipynb for web reading.
§6.2 Common Distributions — Theory Notebook
"The art of asking the right questions in mathematics is more important than the art of solving them." — Georg Cantor
Interactive exploration of discrete and continuous distributions, MGFs, exponential family, conjugate priors, and their roles in modern machine learning.
Sections:
- Discrete Distributions (Bernoulli → Poisson)
- Categorical and Multinomial
- Continuous Distributions (Uniform, Gaussian, Exponential, Gamma, Beta)
- Dirichlet Distribution
- Student-t and Heavy Tails
- Moment Generating Functions
- Exponential Family
- Conjugate Priors
- ML Applications
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats
from scipy.special import gamma, digamma, gammaln
try:
import matplotlib.pyplot as plt
import matplotlib
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [12, 5]
plt.rcParams['font.size'] = 12
HAS_MPL = True
except ImportError:
HAS_MPL = False
try:
import seaborn as sns
HAS_SNS = True
except ImportError:
HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
def check(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
print('Setup complete.')
1. Discrete Distributions
1.1 Bernoulli Distribution
The simplest random variable: a single binary trial with success probability $p$.
PMF unified form: $P(X = k) = p^k (1-p)^{1-k}$ for $k \in \{0, 1\}$
Moments: $E[X] = p$, $\mathrm{Var}(X) = p(1-p)$
For AI: Every binary classification output is Bernoulli; binary cross-entropy loss is the negative log-likelihood $-[y \log \hat{p} + (1-y)\log(1-\hat{p})]$ — see the sketch below.
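A minimal sketch (not one of the original cells) confirming that binary cross-entropy and the Bernoulli negative log-likelihood coincide pointwise:

```python
# Sketch: binary cross-entropy is the negative log-likelihood of a
# Bernoulli observation — the two coincide pointwise.
import numpy as np

def bce(y, p_hat):
    """Binary cross-entropy for one example."""
    return -(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat))

def bernoulli_nll(y, p_hat):
    """Negative log of the Bernoulli PMF p^y (1-p)^(1-y)."""
    return -np.log(p_hat**y * (1 - p_hat)**(1 - y))

for y in (0, 1):
    for p_hat in (0.1, 0.5, 0.9):
        assert np.isclose(bce(y, p_hat), bernoulli_nll(y, p_hat))
print('BCE matches Bernoulli NLL on all test points.')
```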
Code cell 5
# === 1.1 Bernoulli Distribution ===
p_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
n_sim = 10000
print('Bernoulli: Analytical vs Simulated Moments')
print(f"{'p':>6} {'E[X]':>8} {'Var(X)':>8} {'sim_mean':>10} {'sim_var':>10}")
for p in p_vals:
rv = stats.bernoulli(p)
samples = rv.rvs(n_sim)
print(f"{p:>6.1f} {p:>8.4f} {p*(1-p):>8.4f} {samples.mean():>10.4f} {samples.var():>10.4f}")
# Verify max variance at p=0.5
p_range = np.linspace(0.01, 0.99, 200)
variance = p_range * (1 - p_range)
print(f'\nMax variance {variance.max():.4f} at p = {p_range[variance.argmax()]:.3f}')
check('Max variance at p=0.5', p_range[variance.argmax()], 0.5, tol=0.01)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# PMF
ax = axes[0]
for p in [0.2, 0.5, 0.8]:
ax.bar([0 - 0.1*(p-0.5), 1 - 0.1*(p-0.5)], [1-p, p], width=0.1, alpha=0.7, label=f'p={p}')
ax.set_xlabel('k'); ax.set_ylabel('P(X=k)')
ax.set_title('Bernoulli PMF'); ax.legend(); ax.set_xticks([0, 1])
# Variance curve
ax = axes[1]
ax.plot(p_range, variance, 'steelblue', lw=2)
ax.axvline(0.5, color='red', ls='--', alpha=0.5, label='p=0.5 (max)')
ax.set_xlabel('p'); ax.set_ylabel('Var(X) = p(1-p)')
ax.set_title('Bernoulli Variance vs p'); ax.legend()
plt.tight_layout(); plt.show()
1.2 Binomial Distribution
Sum of $n$ independent Bernoulli($p$) trials: $X = \sum_{i=1}^n X_i \sim \mathrm{Binomial}(n, p)$.
Moments: $E[X] = np$, $\mathrm{Var}(X) = np(1-p)$
Poisson limit: As $n \to \infty$, $p \to 0$ with $np = \lambda$ fixed, $\mathrm{Binomial}(n, p) \to \mathrm{Pois}(\lambda)$
Normal limit (CLT): For large $n$, $\mathrm{Binomial}(n, p) \approx \mathcal{N}(np, np(1-p))$
Code cell 7
# === 1.2 Binomial Distribution ===
n, p = 20, 0.3
rv = stats.binom(n, p)
k_vals = np.arange(0, n+1)
pmf = rv.pmf(k_vals)
print(f'Binomial(n={n}, p={p})')
print(f'E[X] = {rv.mean():.4f} (theory: {n*p:.4f})')
print(f'Var(X) = {rv.var():.4f} (theory: {n*p*(1-p):.4f})')
check('E[X] = np', rv.mean(), n*p)
check('Var(X) = np(1-p)', rv.var(), n*p*(1-p))
# Poisson approximation: n large, p small, np=lambda fixed
lambda_ = 3.0
n_large, p_small = 200, lambda_/200
binom_rv = stats.binom(n_large, p_small)
pois_rv = stats.poisson(lambda_)
k_test = np.arange(0, 12)
tvd = 0.5 * np.sum(np.abs(binom_rv.pmf(k_test) - pois_rv.pmf(k_test)))
print(f'\nPoisson approx: TVD = {tvd:.6f} (smaller is better)')
check('Binom→Poisson TVD < 0.01', tvd, 0.0, tol=0.01)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax = axes[0]
ax.bar(k_vals, pmf, alpha=0.7, color='steelblue', label='PMF')
mu = n*p; sigma = np.sqrt(n*p*(1-p))
x_fine = np.linspace(0, n, 300)
ax.plot(x_fine, stats.norm.pdf(x_fine, mu, sigma), 'r-', lw=2, label=f'N({mu:.0f},{sigma:.2f}²)')
ax.set_xlabel('k'); ax.set_ylabel('P(X=k)'); ax.set_title(f'Binomial({n},{p}) + Normal approx')
ax.legend()
ax = axes[1]
ax.bar(k_test - 0.2, binom_rv.pmf(k_test), width=0.4, alpha=0.7, label=f'Binom({n_large},{p_small:.3f})')
ax.bar(k_test + 0.2, pois_rv.pmf(k_test), width=0.4, alpha=0.7, label=f'Poisson({lambda_})')
ax.set_xlabel('k'); ax.set_ylabel('P(X=k)'); ax.set_title('Poisson Approximation to Binomial')
ax.legend()
plt.tight_layout(); plt.show()
1.3 Geometric and Negative Binomial Distributions
Geometric: number of trials until the first success, $P(X = k) = (1-p)^{k-1} p$.
Moments: $E[X] = 1/p$, $\mathrm{Var}(X) = (1-p)/p^2$
Memoryless property: $P(X > s + t \mid X > s) = P(X > t)$ — the unique discrete memoryless distribution.
Negative Binomial: number of trials until the $r$-th success.
Moments: $E[X] = r/p$, $\mathrm{Var}(X) = r(1-p)/p^2$
For AI: Geometric models sequence length in autoregressive generation (EOS token probability) — see the sketch below.
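A minimal sketch of the EOS claim, assuming a constant per-step EOS probability `p_eos` (a simplification — real models have context-dependent EOS probabilities):

```python
# Sketch: if an autoregressive decoder emitted EOS with a constant
# probability p_eos at every step (a simplifying assumption), the
# generated sequence length would be Geometric(p_eos).
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
p_eos = 0.05
n_seqs = 20_000

def sample_length(rng, p_eos, max_len=10_000):
    """Emit tokens until EOS fires; return the number of steps taken."""
    for step in range(1, max_len + 1):
        if rng.random() < p_eos:
            return step
    return max_len

lengths = np.array([sample_length(rng, p_eos) for _ in range(n_seqs)])
print(f'mean length = {lengths.mean():.2f} (theory 1/p = {1/p_eos:.1f})')
for k in (10, 20, 40):
    print(f'P(len > {k}): empirical={(lengths > k).mean():.4f}, '
          f'theory={stats.geom(p_eos).sf(k):.4f}')
```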
Code cell 9
# === 1.3 Geometric and Negative Binomial ===
p = 0.2
geom_rv = stats.geom(p)
print(f'Geometric(p={p})')
print(f'E[X] = {geom_rv.mean():.4f} (theory: {1/p:.4f})')
print(f'Var(X) = {geom_rv.var():.4f} (theory: {(1-p)/p**2:.4f})')
check('E[X] = 1/p', geom_rv.mean(), 1/p)
check('Var(X) = (1-p)/p^2', geom_rv.var(), (1-p)/p**2)
# Memoryless property: P(X > s+t | X > s) = P(X > t)
s, t = 5, 3
lhs = geom_rv.sf(s + t) / geom_rv.sf(s)
rhs = geom_rv.sf(t)
check(f'Memoryless: P(X>{s+t}|X>{s}) = P(X>{t})', lhs, rhs)
# Negative Binomial — note scipy's nbinom counts *failures* before the
# r-th success, so number of trials = failures + r
r = 3
nb_rv = stats.nbinom(r, p)
print(f'\nNegBinom(r={r}, p={p})')
print(f'E[trials] = {nb_rv.mean() + r:.4f} (theory: {r/p:.4f})')
print(f'Var = {nb_rv.var():.4f} (theory: {r*(1-p)/p**2:.4f})')
check('NegBinom E[trials] = r/p', nb_rv.mean() + r, r/p)
# Over-dispersion: NegBinom has Var > Mean
print(f'\nOver-dispersion check: Var/Mean = {nb_rv.var()/nb_rv.mean():.4f} > 1')
check('NegBinom over-dispersed', nb_rv.var() > nb_rv.mean(), True)
1.4 Poisson Distribution
Models the number of events in a fixed interval when events occur independently at rate $\lambda$: $P(X = k) = \frac{\lambda^k e^{-\lambda}}{k!}$.
Moments: $E[X] = \mathrm{Var}(X) = \lambda$ — mean equals variance.
Poisson additivity: If $X \sim \mathrm{Pois}(\lambda_1)$ and $Y \sim \mathrm{Pois}(\lambda_2)$ independently, then $X + Y \sim \mathrm{Pois}(\lambda_1 + \lambda_2)$.
For AI: Rare token occurrence models; event counting in streaming text processing.
Code cell 11
# === 1.4 Poisson Distribution ===
lambdas = [0.5, 1, 3, 8]
k_max = 20
k_vals = np.arange(0, k_max+1)
print('Poisson: mean = variance check')
for lam in lambdas:
rv = stats.poisson(lam)
check(f'Pois({lam}) E[X]=Var(X)', rv.mean(), rv.var())
# Additivity
lam1, lam2 = 2.0, 3.0
n_sim = 50000
X = stats.poisson(lam1).rvs(n_sim)
Y = stats.poisson(lam2).rvs(n_sim)
Z = X + Y
ks_stat, p_val = stats.kstest(Z, lambda x: stats.poisson(lam1+lam2).cdf(x))
print(f'\nAdditivity: X+Y ~ Pois({lam1+lam2})?')
print(f' Simulated mean={Z.mean():.4f}, var={Z.var():.4f}, expected={lam1+lam2}')
check('Additivity: mean of X+Y', Z.mean(), lam1+lam2, tol=0.05)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
colors = plt.cm.viridis(np.linspace(0.2, 0.9, len(lambdas)))
for lam, c in zip(lambdas, colors):
rv = stats.poisson(lam)
ax.plot(k_vals, rv.pmf(k_vals), 'o-', color=c, label=f'λ={lam}', lw=1.5, ms=5)
ax.set_xlabel('k'); ax.set_ylabel('P(X=k)')
ax.set_title('Poisson PMF for Various λ'); ax.legend()
plt.tight_layout(); plt.show()
2. Categorical and Multinomial
2.1 Categorical Distribution
Generalises Bernoulli to $K$ outcomes: $P(X = k) = p_k$ with $\sum_{k=1}^K p_k = 1$.
Softmax parameterisation: Given logits $z \in \mathbb{R}^K$, $p_k = \mathrm{softmax}(z)_k = \frac{e^{z_k}}{\sum_{j=1}^K e^{z_j}}$
For AI: Every language model output is Categorical over the vocabulary; the final linear layer + softmax defines the distribution over next tokens.
2.2 Multinomial Distribution
Moments: $E[X_k] = np_k$, $\mathrm{Var}(X_k) = np_k(1-p_k)$, $\mathrm{Cov}(X_i, X_j) = -np_ip_j$ for $i \neq j$ (verified in the sketch after the code cell below)
Code cell 13
# === 2.1 Categorical and Multinomial ===
# Softmax parameterisation
def softmax(z):
z = z - z.max() # numerical stability
e = np.exp(z)
return e / e.sum()
K = 5
logits = np.array([2.0, 1.0, 0.5, -0.5, -1.0])
p = softmax(logits)
print(f'Logits: {logits}')
print(f'Probabilities: {p}')
print(f'Sum = {p.sum():.6f}')
check('Softmax sums to 1', p.sum(), 1.0)
check('Softmax all positive', (p > 0).all(), True)
# Temperature scaling
print('\nTemperature scaling effect:')
for T in [0.1, 0.5, 1.0, 2.0, 5.0]:
p_T = softmax(logits / T)
entropy = -np.sum(p_T * np.log(p_T + 1e-10))
print(f' T={T}: max_prob={p_T.max():.4f}, entropy={entropy:.4f}')
# Multinomial
n_draws, n_sim = 100, 10000
counts = np.random.multinomial(n_draws, p, size=n_sim)
print(f'\nMultinomial(n={n_draws}, p):')
print(f'E[X_k] = {counts.mean(axis=0).round(2)}')
print(f'Theory: {(n_draws*p).round(2)}')
check('Multinomial means', counts.mean(axis=0), n_draws*p, tol=0.5)
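The covariance formula stated above is not exercised in the cell; a minimal sketch (with an assumed probability vector) checking $\mathrm{Cov} = n\,(\mathrm{diag}(p) - pp^\top)$ empirically:

```python
# Sketch: empirical check of the Multinomial covariance matrix
# Cov = n * (diag(p) - p p^T); off-diagonals are negative because
# the category counts compete for the same n draws.
import numpy as np

rng = np.random.default_rng(42)
p = np.array([0.5, 0.2, 0.15, 0.1, 0.05])  # assumed probability vector
n_draws, n_sim = 100, 200_000
counts = rng.multinomial(n_draws, p, size=n_sim)

emp_cov = np.cov(counts, rowvar=False)
theory_cov = n_draws * (np.diag(p) - np.outer(p, p))
print('max |empirical - theory| =', f'{np.abs(emp_cov - theory_cov).max():.4f}')
```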
3. Continuous Distributions
3.1 Uniform Distribution
For $X \sim \mathrm{Uniform}(a, b)$ — Moments: $E[X] = \frac{a+b}{2}$, $\mathrm{Var}(X) = \frac{(b-a)^2}{12}$
Universality of the Uniform: If $F$ is a continuous CDF and $X \sim F$, then $F(X) \sim \mathrm{Uniform}(0, 1)$. Conversely, $F^{-1}(U) \sim F$ for $U \sim \mathrm{Uniform}(0, 1)$ — the inverse transform method.
Code cell 15
# === 3.1 Uniform Distribution and Inverse Transform ===
import numpy as np
from scipy import stats
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.random.seed(42)
def check(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
# Inverse transform: U ~ Uniform → X ~ Exp(lambda)
lambda_ = 2.0
n = 50000
U = np.random.uniform(0, 1, n)
X = -np.log(1 - U) / lambda_ # F^{-1}(u) = -log(1-u)/lambda
exp_rv = stats.expon(scale=1/lambda_)
ks_stat, p_val = stats.kstest(X, exp_rv.cdf)
print(f'Inverse Transform: U -> Exp({lambda_})')
print(f' Sample mean = {X.mean():.4f}, expected = {1/lambda_:.4f}')
print(f' KS test: stat={ks_stat:.4f}, p={p_val:.4f}')
check('Inverse transform mean', X.mean(), 1/lambda_, tol=0.02)
check('KS test passes (p > 0.05)', p_val > 0.05, True)
# Universality: Phi(X) ~ Uniform when X ~ Normal
X_norm = np.random.standard_normal(n)
U2 = stats.norm.cdf(X_norm)
ks2, p2 = stats.kstest(U2, 'uniform')
print(f'\nUniversality: Phi(N(0,1)) ~ Uniform(0,1)?')
print(f' KS test: stat={ks2:.4f}, p={p2:.4f}')
check('Universality KS test passes', p2 > 0.05, True)
3.2 Gaussian Distribution
The most important continuous distribution in ML.
Standard normal: $Z = \frac{X - \mu}{\sigma} \sim \mathcal{N}(0, 1)$
Moments: $E[X] = \mu$, $\mathrm{Var}(X) = \sigma^2$, all odd central moments = 0
68-95-99.7 rule: $P(|X - \mu| \le k\sigma)$ = 68.27% ($k=1$), 95.45% ($k=2$), 99.73% ($k=3$)
MGF: $M_X(t) = \exp(\mu t + \sigma^2 t^2/2)$
Stability: $X_1 + X_2 \sim \mathcal{N}(\mu_1 + \mu_2, \sigma_1^2 + \sigma_2^2)$ for independent $X_i \sim \mathcal{N}(\mu_i, \sigma_i^2)$
For AI: Weight initialisation (Xavier/Kaiming use Gaussian), VAE latent space, diffusion model noise schedule, Gaussian attention approximations.
Code cell 17
# === 3.2 Gaussian Distribution ===
mu, sigma = 2.0, 1.5
rv = stats.norm(loc=mu, scale=sigma)
n = 100000
X = rv.rvs(n)
print(f'Gaussian(mu={mu}, sigma^2={sigma**2})')
print(f'E[X] = {X.mean():.4f} (theory: {mu})')
print(f'Var(X) = {X.var():.4f} (theory: {sigma**2})')
print(f'Skew = {stats.skew(X):.4f} (theory: 0)')
print(f'Kurt = {stats.kurtosis(X):.4f} (theory: 0, excess kurtosis)')
# 68-95-99.7 rule
print('\n68-95-99.7 rule:')
for k in [1, 2, 3]:
theory = stats.norm.cdf(k) - stats.norm.cdf(-k)
empirical = np.mean(np.abs(X - mu) <= k * sigma)
print(f' k={k}: theory={theory:.4f}, empirical={empirical:.4f}')
# Stability under addition
n_sim = 50000
mu1, s1, mu2, s2 = 1.0, 1.0, 2.0, 1.5
X1 = np.random.normal(mu1, s1, n_sim)
X2 = np.random.normal(mu2, s2, n_sim)
Z = X1 + X2
print(f'\nStability: X1+X2 ~ N({mu1+mu2}, {s1**2+s2**2})?')
print(f' Z mean = {Z.mean():.4f}, var = {Z.var():.4f}')
print(f' Theory: mean={mu1+mu2}, var={s1**2+s2**2}')
check = lambda n, g, e, t=0.05: print(f"{'PASS' if abs(g-e)<t else 'FAIL'} — {n}: {g:.4f}")
check('Z mean', Z.mean(), mu1+mu2)
check('Z var', Z.var(), s1**2+s2**2)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
x_range = np.linspace(-4, 8, 400)
ax = axes[0]
for (m, s, c) in [(0, 1, 'steelblue'), (2, 1.5, 'firebrick'), (0, 2, 'forestgreen')]:
ax.plot(x_range, stats.norm.pdf(x_range, m, s), lw=2, label=f'N({m},{s}²)')
ax.set_xlabel('x'); ax.set_ylabel('f(x)'); ax.set_title('Gaussian PDFs'); ax.legend()
ax = axes[1]
ax.hist(Z, bins=80, density=True, alpha=0.6, color='steelblue', label='X1+X2')
ax.plot(x_range, stats.norm.pdf(x_range, mu1+mu2, np.sqrt(s1**2+s2**2)),
'r-', lw=2, label='Theory N(3, 3.25)')
ax.set_xlabel('z'); ax.set_title('Stability of Gaussian'); ax.legend()
plt.tight_layout(); plt.show()
3.3 Exponential Distribution
Models time between events in a Poisson process.
Moments: $E[X] = 1/\lambda$, $\mathrm{Var}(X) = 1/\lambda^2$, coefficient of variation $\mathrm{CV} = \sigma/\mu = 1$
CDF: $F(x) = 1 - e^{-\lambda x}$ (exact, no approximation)
Memoryless property (continuous): $P(X > s + t \mid X > s) = P(X > t)$
The Exponential is the unique continuous memoryless distribution.
Connection to Poisson: Inter-arrival times of a rate-$\lambda$ Poisson process are $\mathrm{Exp}(\lambda)$.
Code cell 19
# === 3.3 Exponential Distribution ===
lambda_ = 2.0
rv = stats.expon(scale=1/lambda_)
print(f'Exponential(lambda={lambda_})')
print(f'E[X] = {rv.mean():.4f} (theory: {1/lambda_:.4f})')
print(f'Var(X) = {rv.var():.4f} (theory: {1/lambda_**2:.4f})')
print(f'CV = {np.sqrt(rv.var())/rv.mean():.4f} (theory: 1.0000)')
def check(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
check('E[X]', rv.mean(), 1/lambda_)
check('Var(X)', rv.var(), 1/lambda_**2)
# Memoryless property
s, t = 1.0, 2.0
lhs = rv.sf(s + t) / rv.sf(s)
rhs = rv.sf(t)
check(f'Memoryless: P(X>{s+t}|X>{s}) = P(X>{t})', lhs, rhs)
# Simulate Poisson inter-arrival times
n_events = 10000
inter_arrivals = rv.rvs(n_events)
ks_stat, p_val = stats.kstest(inter_arrivals, rv.cdf)
print(f'\nPoisson inter-arrival simulation: KS p={p_val:.4f}')
check('KS test passes', p_val > 0.05, True)
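The KS check above draws samples from the same distribution it tests against, so it is circular as a demonstration of the Poisson connection. A minimal sketch that exercises the connection directly — accumulate $\mathrm{Exp}(\lambda)$ inter-arrivals and verify that event counts per unit window behave like $\mathrm{Pois}(\lambda)$:

```python
# Sketch: construct a Poisson process from Exp(lambda) inter-arrivals and
# verify that event counts per unit window behave like Pois(lambda).
import numpy as np

rng = np.random.default_rng(1)
lam = 2.0
n_windows = 20_000

counts = np.empty(n_windows, dtype=int)
for i in range(n_windows):
    t, k = 0.0, 0
    while True:
        t += rng.exponential(1 / lam)  # next inter-arrival time
        if t > 1.0:
            break
        k += 1
    counts[i] = k

print(f'count mean = {counts.mean():.3f}, var = {counts.var():.3f} (theory: {lam})')
```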
3.4 Gamma Distribution
Gamma$(\alpha, \beta)$ (shape $\alpha$, rate $\beta$): for integer $\alpha$, the sum of $\alpha$ independent $\mathrm{Exp}(\beta)$ random variables.
Moments: $E[X] = \alpha/\beta$, $\mathrm{Var}(X) = \alpha/\beta^2$
Special cases:
- $\alpha = 1$: $\mathrm{Gamma}(1, \beta) = \mathrm{Exp}(\beta)$
- $\alpha = n/2$, $\beta = 1/2$: $\chi^2_n$ (chi-squared)
Additivity: $\sum_{i=1}^n X_i \sim \mathrm{Gamma}(n\alpha, \beta)$ for iid $X_i \sim \mathrm{Gamma}(\alpha, \beta)$
For AI: Conjugate prior for Poisson rate; models waiting times in queuing theory; used in variational inference and normalising flows.
Code cell 21
# === 3.4 Gamma Distribution ===
from scipy.special import gamma as gamma_fn
alpha_vals = [0.5, 1, 2, 5, 10]
beta = 1.0
print('Gamma distribution moments:')
print(f"{'alpha':>8} {'E[X]':>8} {'Var(X)':>10}")
for a in alpha_vals:
rv = stats.gamma(a, scale=1/beta)
print(f"{a:>8.1f} {rv.mean():>8.4f} {rv.var():>10.4f}")
def check(name, got, expected, tol=1e-6):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
# Chi-squared connection: Gamma(n/2, 1/2)
n_df = 4
gamma_rv = stats.gamma(n_df/2, scale=2)
chi2_rv = stats.chi2(n_df)
x_test = np.array([1.0, 2.0, 4.0, 6.0])
check('Gamma(n/2,1/2) = Chi2(n) CDF', gamma_rv.cdf(x_test), chi2_rv.cdf(x_test))
# Additivity
n_sim = 50000
a = 2.0; b = 1.0
X = stats.gamma(a, scale=1/b).rvs(n_sim)
Y = stats.gamma(a, scale=1/b).rvs(n_sim)
Z = X + Y
expected_rv = stats.gamma(2*a, scale=1/b)
print(f'\nAdditivity: X+Y ~ Gamma({2*a}, {b})?')
print(f' Z: mean={Z.mean():.4f}, var={Z.var():.4f}')
print(f' Theory: mean={expected_rv.mean():.4f}, var={expected_rv.var():.4f}')
check('Gamma additivity mean', Z.mean(), expected_rv.mean(), tol=0.05)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
x_range = np.linspace(0, 20, 400)
colors = plt.cm.plasma(np.linspace(0.1, 0.9, len(alpha_vals)))
for a, c in zip(alpha_vals, colors):
rv = stats.gamma(a, scale=1/beta)
ax.plot(x_range, rv.pdf(x_range), lw=2, color=c, label=f'α={a}')
ax.set_xlim(0, 20); ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Gamma PDFs (β=1)'); ax.legend()
plt.tight_layout(); plt.show()
3.5 Beta Distribution
Models probabilities (values in $[0, 1]$).
PDF: $f(x) = \frac{x^{\alpha-1}(1-x)^{\beta-1}}{B(\alpha, \beta)}$ where $B(\alpha, \beta) = \frac{\Gamma(\alpha)\Gamma(\beta)}{\Gamma(\alpha+\beta)}$.
Moments: $E[X] = \frac{\alpha}{\alpha+\beta}$, $\mathrm{Var}(X) = \frac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta+1)}$
Shape patterns: $\alpha = \beta = 1$ → Uniform; $\alpha = \beta > 1$ → symmetric bell; $\alpha < \beta$ → right-skewed; $\alpha, \beta < 1$ → U-shaped (bimodal at boundaries)
For AI: Conjugate prior for Bernoulli/Binomial; models mixture weights, topic proportions in LDA, and attention dropout rates.
Code cell 23
# === 3.5 Beta Distribution ===
configs = [(0.5, 0.5), (1, 1), (2, 5), (5, 2), (2, 2), (10, 10)]
print('Beta distribution moments:')
print(f"{'(α,β)':>12} {'E[X]':>8} {'Var(X)':>10} {'mode':>10}")
for a, b in configs:
rv = stats.beta(a, b)
mode = (a-1)/(a+b-2) if a > 1 and b > 1 else 'boundary'
mode_str = f'{mode:.4f}' if isinstance(mode, float) else mode
print(f"({a},{b}):{'':>4} {rv.mean():>8.4f} {rv.var():>10.6f} {mode_str:>10}")
def check(name, got, expected, tol=1e-6):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
a, b = 3.0, 5.0
rv = stats.beta(a, b)
check('Beta mean', rv.mean(), a/(a+b))
check('Beta variance', rv.var(), a*b/((a+b)**2*(a+b+1)))
if HAS_MPL:
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
x_range = np.linspace(0.001, 0.999, 400)
colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(configs)))
for (a, b), ax, c in zip(configs, axes.flat, colors):
rv = stats.beta(a, b)
ax.plot(x_range, rv.pdf(x_range), lw=2, color=c)
ax.set_title(f'Beta(α={a}, β={b})')
ax.set_xlim(0, 1); ax.set_ylim(bottom=0)
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
plt.suptitle('Beta Distribution Shape Patterns', fontsize=14)
plt.tight_layout(); plt.show()
3.6 Dirichlet Distribution
Multivariate generalisation of Beta; models probability vectors on the simplex.
Moments: $E[X_k] = \alpha_k / \alpha_0$ where $\alpha_0 = \sum_{k=1}^K \alpha_k$
Concentration parameter $\alpha$ (symmetric case, $\alpha_k = \alpha$ for all $k$):
- $\alpha < 1$: sparse — mass concentrated near corners (few active topics)
- $\alpha = 1$: uniform over the simplex
- $\alpha > 1$: dense — mass near the centroid
For AI: LDA topic models use Dirichlet priors over topic distributions; Dirichlet is the conjugate prior for Categorical/Multinomial.
Code cell 25
# === 3.6 Dirichlet Distribution ===
K = 5
n_sim = 5000
# Various concentration regimes
alpha_configs = [
('sparse (α=0.1)', np.ones(K) * 0.1),
('uniform (α=1)', np.ones(K) * 1.0),
('dense (α=10)', np.ones(K) * 10.0),
('asymmetric', np.array([5, 2, 1, 1, 0.5])),
]
def check(name, got, expected, tol=1e-6):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
print('Dirichlet: mean vs theory')
for name, alpha in alpha_configs:
samples = np.random.dirichlet(alpha, n_sim)
alpha0 = alpha.sum()
theory_mean = alpha / alpha0
print(f' {name}: sim_mean={samples.mean(0).round(3)}, theory={theory_mean.round(3)}')
# Verify: marginals are Beta
alpha = np.array([2.0, 3.0, 5.0, 1.0])
K = len(alpha)
samples = np.random.dirichlet(alpha, 20000)
# X_1 ~ Beta(alpha_1, alpha_0 - alpha_1)
alpha0 = alpha.sum()
beta_rv = stats.beta(alpha[0], alpha0 - alpha[0])
ks_stat, p_val = stats.kstest(samples[:, 0], beta_rv.cdf)
print(f'\nDirichlet marginal ~ Beta({alpha[0]},{alpha0-alpha[0]})?')
print(f' KS test: stat={ks_stat:.4f}, p={p_val:.4f}')
check('Dirichlet marginal is Beta', p_val > 0.05, True)
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
for (name, alpha), ax in zip(alpha_configs[:3], axes):
samples_2d = np.random.dirichlet(alpha[:3], 2000)
ax.scatter(samples_2d[:, 0], samples_2d[:, 1], alpha=0.3, s=5, c='steelblue')
ax.set_xlabel('x₁'); ax.set_ylabel('x₂')
ax.set_title(f'Dir: {name}')
plt.tight_layout(); plt.show()
3.7 Student-t Distribution
Arises naturally in Bayesian inference and as a standard Gaussian divided by the square root of an independent scaled chi-squared: $T = \frac{Z}{\sqrt{V/\nu}}$ with $Z \sim \mathcal{N}(0,1)$, $V \sim \chi^2_\nu$.
Moments: $E[X] = 0$ (for $\nu > 1$), $\mathrm{Var}(X) = \frac{\nu}{\nu-2}$ (for $\nu > 2$)
Heavy tails: polynomial decay $f(x) \sim |x|^{-(\nu+1)}$ vs Gaussian exponential decay
Limits: $\nu = 1$ → Cauchy (no mean); $\nu \to \infty$ → $\mathcal{N}(0, 1)$
For AI: Robust regression; heavy-tailed priors in Bayesian neural networks; t-SNE uses Student-t kernel in the low-dimensional space to model cluster structure.
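A small sketch of why the heavy tail matters for t-SNE-style similarities: comparing the Gaussian kernel $e^{-d^2}$ with the $t(1)$ (Cauchy) kernel $1/(1+d^2)$ that t-SNE uses in the low-dimensional space.

```python
# Sketch: Gaussian kernel exp(-d^2) vs the t(1) (Cauchy) kernel 1/(1+d^2)
# used by t-SNE for low-dimensional similarities; the heavy tail keeps
# moderate distances from collapsing to ~zero similarity.
import numpy as np

for d in [0.5, 1.0, 2.0, 4.0, 8.0]:
    g = np.exp(-d**2)
    t = 1.0 / (1.0 + d**2)
    print(f'd={d:>4}: gaussian={g:.2e}  t(1)={t:.2e}  ratio={t/g:.1e}')
```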
Code cell 27
# === 3.7 Student-t Distribution ===
nu_vals = [1, 2, 5, 30]
x_range = np.linspace(-6, 6, 400)
def check(name, got, expected, tol=0.05):
ok = abs(got - expected) < tol
print(f"{'PASS' if ok else 'FAIL'} — {name}: {got:.4f}")
print('Student-t tail weight comparison (P(|X|>3)):')
print(f"{'nu':>6} {'P(|X|>3)':>12} {'vs Normal':>12}")
normal_tail = 2 * stats.norm.sf(3)
for nu in nu_vals:
rv = stats.t(nu)
tail = 2 * rv.sf(3)
ratio = tail / normal_tail
var = nu/(nu-2) if nu > 2 else float('inf')
print(f"{nu:>6} {tail:>12.6f} {ratio:>10.1f}x Var={var:.3f}")
# t -> Normal as nu -> inf
nu = 100
t_rv = stats.t(nu)
norm_rv = stats.norm()
x_test = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])
diff = np.abs(t_rv.cdf(x_test) - norm_rv.cdf(x_test))
print(f'\nt(nu=100) vs N(0,1): max CDF diff = {diff.max():.6f}')
check('t(100) ~ Normal: max CDF diff < 0.001', diff.max(), 0.0, tol=0.001)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x_range, stats.norm.pdf(x_range), 'k-', lw=2.5, label='N(0,1)')
colors = plt.cm.plasma(np.linspace(0.1, 0.85, len(nu_vals)))
for nu, c in zip(nu_vals, colors):
ax.plot(x_range, stats.t(nu).pdf(x_range), '--', lw=1.5, color=c, label=f't(ν={nu})')
ax.set_xlim(-6, 6); ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.set_title('Student-t vs Gaussian: Heavy Tails'); ax.legend()
plt.tight_layout(); plt.show()
4. Moment Generating Functions
The moment generating function $M_X(t) = E[e^{tX}]$ encodes all moments: $E[X^n] = M_X^{(n)}(0)$.
Key MGFs:
| Distribution | MGF |
|---|---|
| $\mathcal{N}(\mu, \sigma^2)$ | $\exp(\mu t + \sigma^2 t^2/2)$ |
| $\mathrm{Pois}(\lambda)$ | $\exp\big(\lambda(e^t - 1)\big)$ |
| $\mathrm{Exp}(\lambda)$ | $\frac{\lambda}{\lambda - t}$ for $t < \lambda$ |
| $\mathrm{Gamma}(\alpha, \beta)$ | $\big(\frac{\beta}{\beta - t}\big)^{\alpha}$ for $t < \beta$ |
Product rule: If $X \perp Y$, then $M_{X+Y}(t) = M_X(t)\, M_Y(t)$
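As a quick numerical sanity check (a sketch, not one of the original cells), we can estimate $M_X(t) = E[e^{tX}]$ for the Exponential by Monte Carlo and compare against $\lambda/(\lambda - t)$; the estimate is only stable for $t$ well below $\lambda$, where $e^{tX}$ has finite variance.

```python
# Sketch: Monte Carlo estimate of the Exponential MGF, M(t) = E[e^{tX}],
# compared to lambda/(lambda - t). Only t < lambda/2 is used so that
# e^{tX} has finite variance and the estimate is stable.
import numpy as np

rng = np.random.default_rng(7)
lam = 2.0
X = rng.exponential(1 / lam, size=500_000)
for t in [0.0, 0.3, 0.6, 0.9]:
    mc = np.exp(t * X).mean()
    exact = lam / (lam - t)
    print(f't={t}: MC={mc:.4f}, exact={exact:.4f}')
```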
Code cell 29
# === 4. Moment Generating Functions ===
import numpy as np
from scipy import stats
def check(name, got, expected, tol=1e-6):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
# Verify MGF moments for Gaussian
mu, sigma = 1.5, 2.0
rv = stats.norm(loc=mu, scale=sigma)
# MGF of N(mu, sigma^2) = exp(mu*t + sigma^2*t^2/2)
# Derivatives at t=0:
# M'(0) = mu
# M''(0) = sigma^2 + mu^2 = E[X^2]
E1 = mu
E2 = sigma**2 + mu**2
E3 = mu**3 + 3*mu*sigma**2 # skewness = 0 for N, but E[X^3] = mu^3 + 3mu*sigma^2
# Verify with samples
n = 500000
X = rv.rvs(n)
print('Gaussian MGF moment verification:')
check('E[X] = mu', X.mean(), E1, tol=0.01)
check('E[X^2] = sigma^2 + mu^2', (X**2).mean(), E2, tol=0.1)
check('E[X^3] = mu^3 + 3*mu*sigma^2', (X**3).mean(), E3, tol=0.5)
# Sum of Gaussians via MGF product rule
mu1, s1 = 1.0, 1.0
mu2, s2 = 2.0, 1.5
X1 = np.random.normal(mu1, s1, n)
X2 = np.random.normal(mu2, s2, n)
Z = X1 + X2
# By product rule: N(mu1+mu2, s1^2+s2^2)
print(f'\nSum via MGF product rule: N({mu1+mu2}, {s1**2+s2**2})?')
print(f' Z: mean={Z.mean():.4f}, var={Z.var():.4f}')
check('Sum mean', Z.mean(), mu1+mu2, tol=0.02)
check('Sum var', Z.var(), s1**2+s2**2, tol=0.1)
# Poisson MGF: E[X] = lambda, Var[X] = lambda (from MGF derivatives)
lam = 3.5
pois_samples = np.random.poisson(lam, n)
print(f'\nPoisson({lam}) via MGF: E[X]=Var[X]=lambda')
check('Poisson mean', pois_samples.mean(), lam, tol=0.05)
check('Poisson var', pois_samples.var(), lam, tol=0.05)
5. Exponential Family
Many common distributions share a unifying canonical form:
$p(x \mid \eta) = h(x)\, \exp\left(\eta^\top T(x) - A(\eta)\right)$
where:
- $\eta$: natural (canonical) parameters
- $T(x)$: sufficient statistics
- $A(\eta)$: log-partition function
- $h(x)$: base measure
Log-partition theorem: $\nabla_\eta A(\eta) = E[T(X)]$
Members: Gaussian, Exponential, Gamma, Beta, Poisson, Binomial, Bernoulli, Categorical, Dirichlet
For AI: GLMs, ELBO in VAEs, natural gradient descent, sufficient statistics in online learning.
Code cell 31
# === 5. Exponential Family ===
import numpy as np
from scipy import stats
def check(name, got, expected, tol=0.05):
ok = abs(got - expected) < tol
print(f"{'PASS' if ok else 'FAIL'} — {name}: {got:.6f}")
# Gaussian as exponential family:
# eta = (mu/sigma^2, -1/(2*sigma^2))
# T(x) = (x, x^2)
# A(eta) = -eta1^2/(4*eta2) - 0.5*log(-2*eta2)
mu, sigma = 2.0, 1.5
eta1 = mu / sigma**2
eta2 = -1 / (2 * sigma**2)
def A_gaussian(eta1, eta2):
return -eta1**2 / (4*eta2) - 0.5 * np.log(-2*eta2)
# E[T(X)] = (E[X], E[X^2]) = (mu, sigma^2 + mu^2)
# Verify via gradient of A
eps = 1e-7
dA_deta1 = (A_gaussian(eta1+eps, eta2) - A_gaussian(eta1-eps, eta2)) / (2*eps)
dA_deta2 = (A_gaussian(eta1, eta2+eps) - A_gaussian(eta1, eta2-eps)) / (2*eps)
print('Gaussian exponential family: log-partition gradients = moments')
print(f' dA/d_eta1 = {dA_deta1:.6f} (E[X] = {mu:.6f})')
print(f' dA/d_eta2 = {dA_deta2:.6f} (E[X^2] = {sigma**2+mu**2:.6f})')
check('dA/d_eta1 = E[X]', dA_deta1, mu)
check('dA/d_eta2 = E[X^2]', dA_deta2, sigma**2+mu**2)
# Poisson as exponential family:
# eta = log(lambda), T(x) = x, A(eta) = e^eta = lambda
# E[T(X)] = E[X] = lambda = e^eta = dA/d_eta
lam = 4.0
eta_pois = np.log(lam)
A_pois = np.exp(eta_pois)
dA_deta_pois = A_pois # derivative of e^eta = e^eta
print(f'\nPoisson exponential family:')
print(f' eta = log(lambda) = {eta_pois:.4f}')
print(f' dA/d_eta = e^eta = {dA_deta_pois:.4f} (E[X] = {lam:.4f})')
check('Poisson: dA/d_eta = lambda', dA_deta_pois, lam, tol=1e-10)
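The same gradient identity can be checked for the Bernoulli (a member listed above), where $\eta = \log\frac{p}{1-p}$, $T(x) = x$, and $A(\eta) = \log(1 + e^\eta)$, so $A'(\eta) = \sigma(\eta) = p$ — a quick sketch:

```python
# Sketch: Bernoulli as exponential family — eta = logit(p), T(x) = x,
# A(eta) = log(1 + e^eta), so dA/d_eta = sigmoid(eta) = p = E[X].
import numpy as np

p = 0.3
eta = np.log(p / (1 - p))
A = lambda e: np.log1p(np.exp(e))

eps = 1e-7
dA_deta = (A(eta + eps) - A(eta - eps)) / (2 * eps)  # central difference
print(f'dA/d_eta = {dA_deta:.6f} (E[X] = p = {p})')
```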
6. Conjugate Priors
A prior $p(\theta)$ is conjugate to a likelihood $p(x \mid \theta)$ if the posterior is in the same family as the prior.
6.1 Beta-Bernoulli (Beta-Binomial)
- Prior: $p \sim \mathrm{Beta}(\alpha, \beta)$
- Likelihood: $x_1, \ldots, x_n \sim \mathrm{Bernoulli}(p)$
- Posterior: $p \mid x \sim \mathrm{Beta}\left(\alpha + \sum_i x_i,\; \beta + n - \sum_i x_i\right)$
Interpretation: $\alpha$ = prior successes, $\beta$ = prior failures; data adds counts.
6.2 Gamma-Poisson
- Prior: $\lambda \sim \mathrm{Gamma}(\alpha, \beta)$
- Likelihood: $x_1, \ldots, x_n \sim \mathrm{Pois}(\lambda)$
- Posterior: $\lambda \mid x \sim \mathrm{Gamma}\left(\alpha + \sum_i x_i,\; \beta + n\right)$
6.3 Dirichlet-Categorical
- Prior: $\mathbf{p} \sim \mathrm{Dir}(\alpha_1, \ldots, \alpha_K)$
- Likelihood: $x_1, \ldots, x_n \sim \mathrm{Cat}(\mathbf{p})$
- Posterior: $\mathbf{p} \mid x \sim \mathrm{Dir}(\alpha_1 + c_1, \ldots, \alpha_K + c_K)$
where $c_k$ is the count of category $k$ in the data; a minimal sketch follows.
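The code cell below covers the first two updates; here is a minimal Dirichlet-Categorical sketch (with an assumed true probability vector) showing that the posterior concentration is prior plus counts:

```python
# Sketch: Dirichlet-Categorical conjugate update — posterior concentration
# equals prior concentration plus per-category counts.
import numpy as np

rng = np.random.default_rng(42)
true_p = np.array([0.5, 0.3, 0.2])   # assumed true category probabilities
alpha_prior = np.ones(3)             # uniform Dirichlet prior

data = rng.choice(3, size=200, p=true_p)
counts = np.bincount(data, minlength=3)
alpha_post = alpha_prior + counts

print('counts         =', counts)
print('posterior mean =', (alpha_post / alpha_post.sum()).round(3),
      '(true p =', true_p, ')')
```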
Code cell 33
# === 6. Conjugate Priors ===
import numpy as np
from scipy import stats
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def check(name, got, expected, tol=1e-6):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
return ok
# Beta-Bernoulli: coin flip example
np.random.seed(42)
true_p = 0.7
alpha_prior, beta_prior = 2.0, 2.0 # weakly informative prior
n_obs = 20
obs = np.random.binomial(1, true_p, n_obs)
n_heads = obs.sum()
n_tails = n_obs - n_heads
alpha_post = alpha_prior + n_heads
beta_post = beta_prior + n_tails
prior_rv = stats.beta(alpha_prior, beta_prior)
post_rv = stats.beta(alpha_post, beta_post)
mle_p = n_heads / n_obs
post_mean = alpha_post / (alpha_post + beta_post)
print(f'Beta-Bernoulli Conjugate Update')
print(f' Prior: Beta({alpha_prior}, {beta_prior}) mean={prior_rv.mean():.4f}')
print(f' Data: n={n_obs}, heads={n_heads}')
print(f' Posterior: Beta({alpha_post}, {beta_post}) mean={post_mean:.4f}')
print(f' MLE: p = {mle_p:.4f}')
print(f' True p: {true_p}')
# Verify: posterior mode = MAP estimate
post_mode = (alpha_post - 1) / (alpha_post + beta_post - 2)
print(f' MAP estimate: {post_mode:.4f}')
# Gamma-Poisson update
alpha_g, beta_g = 3.0, 1.0 # prior: Gamma(3,1), mean=3
true_lam = 5.0
pois_data = np.random.poisson(true_lam, 15)
alpha_post_g = alpha_g + pois_data.sum()
beta_post_g = beta_g + len(pois_data)
post_lam_mean = alpha_post_g / beta_post_g
print(f'\nGamma-Poisson Conjugate Update')
print(f' Prior mean = {alpha_g/beta_g:.4f}')
print(f' Data sum = {pois_data.sum()}, n={len(pois_data)}')
print(f' Posterior mean = {post_lam_mean:.4f} (true λ={true_lam})')
if HAS_MPL:
x = np.linspace(0, 1, 400)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x, prior_rv.pdf(x), 'b--', lw=2, label=f'Prior Beta({alpha_prior},{beta_prior})')
ax.plot(x, post_rv.pdf(x), 'r-', lw=2, label=f'Posterior Beta({alpha_post},{beta_post})')
ax.axvline(mle_p, color='g', ls=':', lw=2, label=f'MLE={mle_p:.3f}')
ax.axvline(true_p, color='k', ls='-', lw=1.5, label=f'True p={true_p}')
ax.set_xlabel('p'); ax.set_ylabel('Density')
ax.set_title('Beta-Bernoulli Conjugate Update'); ax.legend()
plt.tight_layout(); plt.show()
7. ML Applications
7.1 Language Model Output Distribution
An LLM's output at each step is $\mathrm{Cat}\left(\mathrm{softmax}(z / T)\right)$, where $z$ are logits and $T$ is temperature.
- $T \to 0$: argmax (greedy) — deterministic
- $T = 1$: standard softmax
- $T \to \infty$: Uniform over vocabulary — maximum entropy
7.2 KL Divergence Between Gaussians (VAE)
For $q = \mathcal{N}(\mu, \sigma^2)$ and $p = \mathcal{N}(0, 1)$:
$\mathrm{KL}(q \,\|\, p) = \tfrac{1}{2}\left(\mu^2 + \sigma^2 - \log \sigma^2 - 1\right)$
This is the VAE regularisation term that keeps the latent space compact.
Code cell 35
# === 7. ML Applications ===
import numpy as np
from scipy import stats
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def check(name, got, expected, tol=1e-6):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
# 7.1 Temperature scaling and entropy
def softmax(logits, T=1.0):
z = logits / T
z = z - z.max()
e = np.exp(z)
return e / e.sum()
K = 10
np.random.seed(0)
logits = np.random.randn(K)
print('Temperature vs Entropy:')
print(f"{'T':>6} {'entropy':>10} {'max_prob':>10}")
for T in [0.01, 0.1, 0.5, 1.0, 2.0, 10.0]:
p = softmax(logits, T)
H = -np.sum(p * np.log(p + 1e-12))
print(f"{T:>6.2f} {H:>10.4f} {p.max():>10.4f}")
# Uniform has max entropy log(K)
max_entropy = np.log(K)
high_T_p = softmax(logits, T=1000)
H_high = -np.sum(high_T_p * np.log(high_T_p + 1e-12))
check('High T -> uniform entropy', H_high, max_entropy, tol=0.01)
# 7.2 VAE KL divergence
def kl_gaussian_to_standard(mu, log_var):
sigma2 = np.exp(log_var)
return 0.5 * (mu**2 + sigma2 - log_var - 1)
print('\nVAE KL divergence KL(N(mu,sigma^2) || N(0,1)):')
test_cases = [(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (2.0, np.log(4.0))]
for mu, lv in test_cases:
kl = kl_gaussian_to_standard(mu, lv)
# Verify: KL=0 iff mu=0, sigma^2=1 (log_var=0)
sigma2 = np.exp(lv)
# KL via formula
kl_exact = 0.5*(mu**2 + sigma2 - lv - 1)
print(f' mu={mu}, log_var={lv:.2f}, sigma^2={sigma2:.2f}: KL={kl:.4f}')
check('KL=0 at mu=0, sigma^2=1', kl_gaussian_to_standard(0.0, 0.0), 0.0)
# 7.3 Cross-entropy loss = negative log-likelihood of Categorical
# For one-hot y, CE(y, p) = -log p_y
def cross_entropy(logits, true_class):
p = softmax(logits)
return -np.log(p[true_class] + 1e-12)
logits_example = np.array([3.0, 1.0, 0.2, -0.5])
true_class = 0
ce = cross_entropy(logits_example, true_class)
p_correct = softmax(logits_example)[true_class]
print(f'\nCross-entropy loss: {ce:.4f}')
print(f'P(correct class) = {p_correct:.4f}')
check('CE = -log(p_correct)', ce, -np.log(p_correct))
8. Distribution Relationships
Visualising the web of connections between common distributions.
Key Limiting Relationships
- Binomial → Poisson: $n \to \infty$, $p \to 0$, $np = \lambda$ fixed
- Binomial → Gaussian: $n \to \infty$ (CLT), $\approx \mathcal{N}(np, np(1-p))$
- Gamma → Gaussian: $\alpha \to \infty$ (CLT), skewness $2/\sqrt{\alpha} \to 0$
- Beta → Gaussian: $\alpha, \beta \to \infty$ proportionally
- $t(\nu)$ → Gaussian: $\nu \to \infty$
- Geometric → Exponential: rescaled waiting time, continuous limit (sketch below)
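A sketch of the Geometric → Exponential limit: if $X \sim \mathrm{Geom}(p)$ with $p = \lambda h$, the rescaled waiting time $hX$ converges to $\mathrm{Exp}(\lambda)$ as the step size $h \to 0$.

```python
# Sketch: Geometric -> Exponential limit. If X ~ Geom(p) with p = lambda*h,
# the rescaled waiting time h*X converges to Exp(lambda) as h -> 0.
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)
lam = 2.0
for h in [0.5, 0.1, 0.01]:
    p = lam * h
    X = h * rng.geometric(p, size=50_000)  # rescaled discrete waiting times
    ks, _ = stats.kstest(X, stats.expon(scale=1/lam).cdf)
    print(f'h={h}: mean={X.mean():.4f} (theory {1/lam}), KS stat={ks:.4f}')
```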
Special Cases
- Exponential = Gamma(1, β)
- Chi-squared(n) = Gamma(n/2, 1/2)
- Beta(1,1) = Uniform(0,1)
- Bernoulli(p) = Binomial(1,p)
- Categorical(K=2) = Bernoulli
- Dirichlet(K=2) = Beta
Code cell 37
# === 8. Distribution Convergence Demonstration ===
import numpy as np
from scipy import stats
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def check(name, got, expected, tol=0.02):
ok = abs(got - expected) < tol
print(f"{'PASS' if ok else 'FAIL'} — {name}: {got:.6f}")
# Gamma → Normal as alpha → inf
alphas = [1, 2, 5, 20, 100]
beta = 1.0
n_sim = 50000
print('Gamma → Gaussian convergence (skewness → 0):')
for a in alphas:
samples = np.random.gamma(a, 1/beta, n_sim)
sk = stats.skew(samples)
print(f' Gamma({a:>3}, {beta}): skew={sk:.4f} (theory: {2/np.sqrt(a):.4f})')
# For Gamma(alpha, beta): skewness = 2/sqrt(alpha) -> 0 as alpha -> inf
a = 100
samples = np.random.gamma(a, 1/beta, n_sim)
check('Gamma(100) skewness near 0', stats.skew(samples), 0.0, tol=0.1)
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# Binomial -> Poisson
ax = axes[0]
lam = 3.0
k = np.arange(0, 15)
ax.bar(k - 0.3, stats.binom(1000, lam/1000).pmf(k), width=0.25, alpha=0.7, label='Binom(1000,0.003)')
ax.bar(k + 0.0, stats.poisson(lam).pmf(k), width=0.25, alpha=0.7, label=f'Poisson({lam})')
ax.set_title('Binomial → Poisson'); ax.legend(); ax.set_xlabel('k')
# Gamma(alpha) -> Normal
ax = axes[1]
x = np.linspace(-2, 8, 400)
for a, alpha in zip(alphas[1:], [0.3, 0.5, 0.7, 0.9]):
mu_g = a/beta; s_g = np.sqrt(a/beta**2)
z = (stats.gamma(a, scale=1/beta).pdf(x))
ax.plot((x - mu_g)/s_g, z * s_g, lw=1.5, alpha=alpha, label=f'Gamma({a})')
ax.plot(np.linspace(-4, 4, 400), stats.norm.pdf(np.linspace(-4, 4, 400)), 'k--', lw=2, label='N(0,1)')
ax.set_xlim(-4, 4); ax.set_title('Gamma → Normal (standardised)'); ax.legend()
# t(nu) -> Normal
ax = axes[2]
x = np.linspace(-5, 5, 400)
for nu, alpha in zip([1, 2, 5, 30], [0.4, 0.6, 0.8, 1.0]):
ax.plot(x, stats.t(nu).pdf(x), lw=1.5, alpha=alpha, label=f't(ν={nu})')
ax.plot(x, stats.norm.pdf(x), 'k--', lw=2, label='N(0,1)')
ax.set_xlim(-5, 5); ax.set_title('t(ν) → Normal'); ax.legend()
plt.tight_layout(); plt.show()
Summary
| Family | Key distributions | AI use case |
|---|---|---|
| Binary | Bernoulli, Binomial | Binary classification, attention masking |
| Count | Poisson, Geometric, NegBin | Token counts, sequence length |
| Categorical | Categorical, Multinomial | Next-token prediction, LDA |
| Location | Gaussian, Student-t | Weights, activations, robust regression |
| Positive | Exponential, Gamma | Inter-arrival times, conjugate Poisson prior |
| Probability | Beta, Dirichlet | Bayesian updating, topic models |
Key unifying insight: All these distributions are members of the exponential family, which enables closed-form Bayesian updating with conjugate priors and efficient natural gradient optimisation in machine learning.
See exercises.ipynb to practise these concepts.