Theory Notebook
Converted from theory.ipynb for web reading.
Estimation Theory
"The problem of statistical estimation is one of the most fundamental in all of science: given observations drawn from some unknown process, what can we infer about that process?" — Sir Ronald A. Fisher
This notebook provides interactive implementations of the core results in estimation theory:
- Bias-variance decomposition and MSE
- Fisher information and the Cramér-Rao bound
- Maximum likelihood estimation for common distributions
- Asymptotic normality simulation
- Confidence intervals (exact, Wald, bootstrap)
- Natural gradient and Fisher information in ML
Companion notes: notes.md | Exercises: exercises.ipynb
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats, optimize
from scipy.special import gammaln
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style='whitegrid', palette='colorblind')
HAS_SNS = True
except ImportError:
plt.style.use('seaborn-v0_8-whitegrid')
HAS_SNS = False
mpl.rcParams.update({
'figure.figsize': (10, 6),
'figure.dpi': 120,
'font.size': 13,
'axes.titlesize': 15,
'axes.labelsize': 13,
'xtick.labelsize': 11,
'ytick.labelsize': 11,
'legend.fontsize': 11,
'legend.framealpha': 0.85,
'lines.linewidth': 2.0,
'axes.spines.top': False,
'axes.spines.right': False,
'savefig.bbox': 'tight',
})
HAS_MPL = True
except ImportError:
HAS_MPL = False
COLORS = {
'primary': '#0077BB',
'secondary': '#EE7733',
'tertiary': '#009988',
'error': '#CC3311',
'neutral': '#555555',
'highlight': '#EE3377',
}
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. Libraries loaded.')
1. Intuition: Estimators as Random Variables
Before computing anything, let us visualise the core concept: an estimator is a random variable whose sampling distribution tells us how reliable our estimates are. We simulate the sampling distribution of the sample mean for different sample sizes.
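A one-line justification for the $\sigma/\sqrt{n}$ scaling checked in the next cell (using independence of the $x_i$):

$$
\operatorname{Var}(\bar{x}_n) = \operatorname{Var}\Big(\frac{1}{n}\sum_{i=1}^n x_i\Big) = \frac{1}{n^2}\sum_{i=1}^n \operatorname{Var}(x_i) = \frac{\sigma^2}{n},
\qquad
\operatorname{sd}(\bar{x}_n) = \frac{\sigma}{\sqrt{n}}.
$$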
Code cell 5
# === 1.1 Sampling Distribution of the Sample Mean ===
# True population: N(mu, sigma^2)
mu_true = 5.0
sigma_true = 2.0
M = 5000 # Number of simulated experiments
sample_sizes = [5, 20, 100, 500]
xbar_distributions = {}
for n in sample_sizes:
# Simulate M experiments, each drawing n samples
samples = np.random.normal(mu_true, sigma_true, size=(M, n))
xbars = samples.mean(axis=1)
xbar_distributions[n] = xbars
print(f'n={n:4d}: mean(xbar)={xbars.mean():.4f}, std(xbar)={xbars.std():.4f}, '
f'expected std={sigma_true/np.sqrt(n):.4f}')
print('\nKey: std(xbar) = sigma/sqrt(n) -- the std halves (variance quarters) when n quadruples')
Code cell 6
# === 1.2 Visualise Sampling Distributions ===
if HAS_MPL:
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
fig.suptitle('Sampling distribution of the sample mean $\\bar{x}_n$\n'
'True parameter: $\\mu^* = 5$, $\\sigma = 2$', fontsize=14)
x_range = np.linspace(mu_true - 4, mu_true + 4, 200)
colors_list = [COLORS['primary'], COLORS['secondary'],
COLORS['tertiary'], COLORS['highlight']]
for ax, n, col in zip(axes, sample_sizes, colors_list):
xbars = xbar_distributions[n]
ax.hist(xbars, bins=50, density=True, alpha=0.6, color=col, edgecolor='white')
# Theoretical N(mu, sigma^2/n)
pdf = stats.norm.pdf(x_range, mu_true, sigma_true/np.sqrt(n))
ax.plot(x_range, pdf, color=COLORS['neutral'], lw=2, label='CLT approx')
ax.axvline(mu_true, color=COLORS['error'], lw=1.5, linestyle='--', label='$\\mu^*$')
ax.set_title(f'$n = {n}$\n$\\sigma_{{\\bar{{x}}}}={sigma_true/np.sqrt(n):.2f}$')
ax.set_xlabel('$\\bar{x}$')
ax.set_ylabel('Density' if n == 5 else '')
if n == 5:
ax.legend(fontsize=9)
fig.tight_layout()
plt.show()
print('Sampling distribution narrows as 1/sqrt(n) -- the CLT in action')
2. Bias-Variance Decomposition
The mean squared error (MSE) of any estimator $\hat\theta$ decomposes as
$$\operatorname{MSE}(\hat\theta) = \mathbb{E}\big[(\hat\theta - \theta)^2\big] = \operatorname{Bias}(\hat\theta)^2 + \operatorname{Var}(\hat\theta).$$
We verify this numerically and visualise the trade-off.
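The identity follows in one line by adding and subtracting $\mathbb{E}[\hat\theta]$ inside the square; the cross term vanishes because $\hat\theta - \mathbb{E}[\hat\theta]$ has zero mean:

$$
\mathbb{E}\big[(\hat\theta - \theta)^2\big]
= \mathbb{E}\big[(\hat\theta - \mathbb{E}[\hat\theta])^2\big] + \big(\mathbb{E}[\hat\theta] - \theta\big)^2
= \operatorname{Var}(\hat\theta) + \operatorname{Bias}(\hat\theta)^2.
$$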
Code cell 8
# === 2.1 MSE = Bias^2 + Variance ===
theta_true = 3.0
sigma = 1.0
n = 20
M = 10000
# Three estimators for theta (Gaussian mean, sigma known)
def estimator_1(data): # Sample mean (unbiased, efficient)
return data.mean(axis=1)
def estimator_2(data): # Constant 2.5 (biased, zero variance)
return np.full(len(data), 2.5)
def estimator_3(data): # First observation (unbiased, inconsistent)
return data[:, 0]
def estimator_4(c): # Shrinkage estimator c*xbar
def est(data):
return c * data.mean(axis=1)
return est
# Generate data
data_all = np.random.normal(theta_true, sigma, size=(M, n))
estimators = {
'Sample mean': estimator_1,
'Constant 2.5': estimator_2,
'First obs': estimator_3,
'0.9*xbar': estimator_4(0.9),
}
print(f'True theta = {theta_true}, n = {n}, sigma = {sigma}')
print(f'{"Estimator":<15} {"E[theta_hat]":>12} {"Bias":>8} {"Var":>10} {"MSE":>10}')
print('-' * 58)
results = {}
for name, est in estimators.items():
estimates = est(data_all)
bias = estimates.mean() - theta_true
var = estimates.var()
mse = ((estimates - theta_true)**2).mean()
mse_decomp = bias**2 + var
results[name] = {'bias': bias, 'var': var, 'mse': mse}
print(f'{name:<15} {estimates.mean():>12.4f} {bias:>8.4f} {var:>10.4f} {mse:>10.4f}')
ok = np.isclose(mse, mse_decomp, atol=1e-3)
print(f' PASS MSE = Bias^2 + Var? {ok}: {mse:.4f} ≈ {bias**2:.4f} + {var:.4f} = {mse_decomp:.4f}')
print('\nConclusion: Sample mean has smallest MSE among unbiased estimators')
Code cell 9
# === 2.2 Bias-Variance Trade-off: Shrinkage Estimator ===
# Shrinkage: hat_theta(c) = c * xbar, c in [0,1]
# MSE(c) = (c-1)^2 * theta^2 + c^2 * sigma^2/n
c_values = np.linspace(0, 1.2, 200)
mse_theory = (c_values - 1)**2 * theta_true**2 + c_values**2 * sigma**2 / n
bias2_theory = (c_values - 1)**2 * theta_true**2
var_theory = c_values**2 * sigma**2 / n
c_optimal = theta_true**2 / (theta_true**2 + sigma**2/n)
mse_optimal = mse_theory[np.argmin(mse_theory)]
mse_ols = sigma**2 / n # OLS (c=1): MSE = sigma^2/n
print(f'Optimal shrinkage: c* = {c_optimal:.4f}')
print(f'MSE at c* = {mse_optimal:.6f}')
print(f'MSE at c=1 (OLS) = {mse_ols:.6f}')
print(f'Improvement: {100*(mse_ols - mse_optimal)/mse_ols:.1f}% reduction in MSE')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(c_values, mse_theory, color=COLORS['primary'], lw=2.5, label='MSE = Bias² + Var')
ax.plot(c_values, bias2_theory, color=COLORS['error'], lw=1.5,
linestyle='--', label='Bias²')
ax.plot(c_values, var_theory, color=COLORS['secondary'], lw=1.5,
linestyle=':', label='Variance')
ax.axvline(1.0, color=COLORS['neutral'], lw=1, linestyle='--', label='Unbiased (c=1)')
ax.axvline(c_optimal, color=COLORS['highlight'], lw=1.5, linestyle='-.', label=f'Optimal c*={c_optimal:.2f}')
ax.scatter([c_optimal], [mse_optimal], color=COLORS['highlight'], s=80, zorder=5)
ax.set_title('Bias-variance trade-off for shrinkage estimator $c\\bar{x}$')
ax.set_xlabel('Shrinkage factor $c$')
ax.set_ylabel('MSE')
ax.legend()
ax.set_xlim(0, 1.2)
fig.tight_layout()
plt.show()
3. Fisher Information and the Cramér-Rao Bound
The Fisher information $\mathcal{I}(\theta)$ measures how much information the data carry about the parameter. The Cramér-Rao bound states that for any unbiased estimator $\hat\theta$ based on $n$ i.i.d. observations,
$$\operatorname{Var}(\hat\theta) \;\ge\; \frac{1}{n\,\mathcal{I}(\theta)}.$$
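As a complement to the closed-form results in the next cell, $\mathcal{I}(\theta) = \mathbb{E}\big[(\partial_\theta \log p(X;\theta))^2\big]$ can also be estimated numerically: draw a large Monte Carlo sample and differentiate the log-density with a central finite difference. A minimal sketch, not one of the original notebook cells; the helper name `fisher_info_mc`, the Bernoulli example, the sample size, and the step size are illustrative choices:

```python
import numpy as np

def fisher_info_mc(logpdf, theta, sampler, n_mc=200_000, h=1e-5, seed=0):
    """Monte Carlo estimate of I(theta) = E[(d/dtheta log p(X; theta))^2],
    with the derivative taken by a central finite difference."""
    rng = np.random.default_rng(seed)
    x = sampler(theta, n_mc, rng)
    score = (logpdf(x, theta + h) - logpdf(x, theta - h)) / (2 * h)
    return np.mean(score**2)

# Bernoulli(p): log p(x; p) = x log p + (1 - x) log(1 - p), so I(p) = 1/(p(1-p))
logpdf_bern = lambda x, p: x * np.log(p) + (1 - x) * np.log(1 - p)
sample_bern = lambda p, m, rng: rng.binomial(1, p, m)

for p in (0.1, 0.3, 0.5):
    I_hat = fisher_info_mc(logpdf_bern, p, sample_bern)
    print(f'p={p}: numerical I(p) = {I_hat:.3f}, analytical 1/(p(1-p)) = {1/(p*(1-p)):.3f}')
```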
Code cell 11
# === 3.1 Fisher Information: Computation and Interpretation ===
# Fisher information I(theta) = Var(score) = -E[d^2 log p / dtheta^2]
# 1. Bernoulli(p)
# log p(x;p) = x*log(p) + (1-x)*log(1-p)
# score = x/p - (1-x)/(1-p)
# I(p) = 1 / (p*(1-p))
p_grid = np.linspace(0.01, 0.99, 200)
I_bernoulli = 1.0 / (p_grid * (1 - p_grid))
print('Bernoulli Fisher Information I(p):')
for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
I = 1.0 / (p * (1-p))
crb = p*(1-p) # CRB variance for n=1
print(f' p={p:.1f}: I(p) = {I:.3f}, CRB = p(1-p) = {crb:.3f}')
print()
# 2. N(mu, sigma^2) with sigma known
# I(mu) = 1 / sigma^2
sigma = 2.0
I_gaussian_mu = 1.0 / sigma**2
print(f'Gaussian N(mu, sigma^2={sigma**2}) with sigma known:')
print(f' I(mu) = 1/sigma^2 = {I_gaussian_mu:.4f}')
print(f' CRB for n samples: sigma^2/n = {sigma**2}/n')
print()
# 3. Poisson(lambda)
# I(lambda) = 1 / lambda
for lam in [1, 2, 5, 10]:
I_pois = 1.0 / lam
print(f' Poisson(lambda={lam}): I(lambda)={I_pois:.4f}, CRB variance = lambda/n = {lam}/n')
print()
print('PASS - Fisher information computed for Bernoulli, Gaussian, Poisson')
Code cell 12
# === 3.2 Visualise Fisher Information for Bernoulli ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
# Left: I(p) for Bernoulli
axes[0].plot(p_grid, I_bernoulli, color=COLORS['primary'], lw=2.5)
axes[0].set_title('Fisher information $\\mathcal{I}(p)$ for Bernoulli$(p)$')
axes[0].set_xlabel('Parameter $p$')
axes[0].set_ylabel('$\\mathcal{I}(p) = 1/[p(1-p)]$')
axes[0].axvline(0.5, color=COLORS['neutral'], lw=1, linestyle='--',
label='$p=0.5$ (minimum info)')
axes[0].legend()
# Right: Log-likelihood shape (shows curvature = Fisher info)
x_obs = np.array([1, 1, 0, 1, 1, 0, 1, 1, 1, 0]) # 7 heads, 3 tails
S = x_obs.sum()
n_obs = len(x_obs)
log_lik = S * np.log(p_grid) + (n_obs - S) * np.log(1 - p_grid)
p_hat = S / n_obs
axes[1].plot(p_grid, log_lik, color=COLORS['secondary'], lw=2.5)
axes[1].axvline(p_hat, color=COLORS['error'], lw=1.5, linestyle='--',
label=f'MLE $\\hat{{p}}={p_hat:.1f}$')
axes[1].set_title(f'Log-likelihood: $n={n_obs}$ obs, $S={S}$ successes')
axes[1].set_xlabel('Parameter $p$')
axes[1].set_ylabel('$\\ell(p)$')
axes[1].legend()
fig.tight_layout()
plt.show()
print('Curvature of log-likelihood at MLE ≈ Fisher information')
Code cell 13
# === 3.3 Cramér-Rao Bound Verification ===
# For Bernoulli(p), the CRB is: Var(p_hat) >= p*(1-p)/n
# The sample mean p_hat = xbar achieves this bound exactly.
p_true = 0.3
M = 50000
n_values = [5, 10, 20, 50, 100, 500]
print(f'CRB verification for Bernoulli(p={p_true})')
print(f'{"n":>6} {"CRB":>10} {"Var(phat)":>12} {"Ratio":>8} {"Pass":>6}')
print('-' * 46)
for n in n_values:
samples = np.random.binomial(1, p_true, size=(M, n))
p_hats = samples.mean(axis=1)
crb = p_true * (1 - p_true) / n
emp_var = p_hats.var()
ratio = emp_var / crb
ok = np.isclose(ratio, 1.0, atol=0.05) # within 5% due to Monte Carlo
print(f'{n:>6} {crb:>10.6f} {emp_var:>12.6f} {ratio:>8.4f} {str(ok):>6}')
print('\nRatio ≈ 1.0 means the MLE achieves the CRB (is efficient)')
4. Maximum Likelihood Estimation
We derive and verify MLEs for four distributions, then demonstrate the connection to cross-entropy.
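As a reminder of where the closed-form estimates below come from, the Bernoulli case takes one line: with $S = \sum_i x_i$,

$$
\ell(p) = S \log p + (n - S)\log(1 - p),
\qquad
\ell'(p) = \frac{S}{p} - \frac{n - S}{1 - p} = 0
\;\Rightarrow\;
\hat{p}_{\mathrm{MLE}} = \frac{S}{n}.
$$

The Poisson, Exponential, and Gaussian MLEs follow from the same recipe: differentiate the log-likelihood and set it to zero.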
Code cell 15
# === 4.1 MLE for Common Distributions ===
np.random.seed(42)
n = 200
# --- Bernoulli ---
p_true = 0.35
x_bern = np.random.binomial(1, p_true, n)
p_hat = x_bern.mean()
print(f'Bernoulli: true p={p_true}, MLE p_hat={p_hat:.4f}, SE={np.sqrt(p_hat*(1-p_hat)/n):.4f}')
# --- Poisson ---
lam_true = 4.2
x_pois = np.random.poisson(lam_true, n)
lam_hat = x_pois.mean()
print(f'Poisson: true lambda={lam_true}, MLE lambda_hat={lam_hat:.4f}, SE={np.sqrt(lam_hat/n):.4f}')
# --- Exponential ---
lam_exp_true = 2.0
x_exp = np.random.exponential(1/lam_exp_true, n)
lam_exp_hat = 1 / x_exp.mean()
crb_exp = lam_exp_true**2 / n
print(f'Exponential: true lambda={lam_exp_true}, MLE={lam_exp_hat:.4f}, SE={1/x_exp.mean()**2 * np.sqrt(x_exp.var()/n):.4f}')
# --- Gaussian ---
mu_true_g, sigma_true_g = 3.5, 1.8
x_gauss = np.random.normal(mu_true_g, sigma_true_g, n)
mu_hat = x_gauss.mean()
sigma2_hat_mle = x_gauss.var() # 1/n denominator (MLE, biased)
sigma2_hat_unb = x_gauss.var(ddof=1) # 1/(n-1) denominator (unbiased)
print(f'Gaussian: true mu={mu_true_g}, MLE mu_hat={mu_hat:.4f}')
print(f' true sigma^2={sigma_true_g**2:.4f}, MLE sigma2_hat={sigma2_hat_mle:.4f} (biased)')
print(f' unbiased s^2={sigma2_hat_unb:.4f} (n-1 denominator)')
print(f' Bias = {sigma2_hat_mle - sigma_true_g**2:.4f}, expected = {-sigma_true_g**2/n:.4f}')
Code cell 16
# === 4.2 Log-Likelihood Surface for Gaussian ===
if HAS_MPL:
mu_grid = np.linspace(2.5, 4.5, 80)
sigma2_grid = np.linspace(0.5, 6.0, 80)
MU, S2 = np.meshgrid(mu_grid, sigma2_grid)
def gaussian_loglik(mu, sigma2, data):
n_d = len(data)
return -n_d/2 * np.log(sigma2) - np.sum((data - mu)**2) / (2*sigma2)
LL = np.array([[gaussian_loglik(m, s2, x_gauss) for m in mu_grid] for s2 in sigma2_grid])
fig, ax = plt.subplots(figsize=(9, 7))
contour = ax.contourf(MU, S2, LL, levels=30, cmap='viridis')
fig.colorbar(contour, ax=ax, label='Log-likelihood $\\ell(\\mu, \\sigma^2)$')
ax.scatter([mu_hat], [sigma2_hat_mle], color=COLORS['error'], s=120,
marker='*', zorder=5, label=f'MLE: $\\hat\\mu={mu_hat:.2f}$, $\\hat\\sigma^2={sigma2_hat_mle:.2f}$')
ax.scatter([mu_true_g], [sigma_true_g**2], color='white', s=80,
marker='x', zorder=5, lw=2, label=f'True: $\\mu^*={mu_true_g}$, $\\sigma^2={sigma_true_g**2}$')
ax.set_title('Log-likelihood surface for Gaussian model ($n=200$)')
ax.set_xlabel('$\\mu$')
ax.set_ylabel('$\\sigma^2$')
ax.legend(loc='upper right')
fig.tight_layout()
plt.show()
Code cell 17
# === 4.3 MLE = Cross-Entropy Minimisation ===
# For binary classification: minimise NLL = maximise likelihood
# NLL(theta) = -1/n sum_i [y_i*log(p_i) + (1-y_i)*log(1-p_i)]
# Generate synthetic binary classification data
np.random.seed(42)
n_cls = 300
w_true = np.array([1.5, -2.0])
b_true = 0.5
X_cls = np.random.randn(n_cls, 2)
logits_true = X_cls @ w_true + b_true
y_cls = (np.random.rand(n_cls) < 1/(1+np.exp(-logits_true))).astype(float)
def sigmoid(z):
return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
def nll_loss(params, X, y):
w, b = params[:2], params[2]
p = sigmoid(X @ w + b)
p = np.clip(p, 1e-12, 1-1e-12)
return -np.mean(y * np.log(p) + (1-y) * np.log(1-p))
# Optimise NLL = MLE
params0 = np.zeros(3)
result = optimize.minimize(nll_loss, params0, args=(X_cls, y_cls),
method='L-BFGS-B')
w_hat = result.x[:2]
b_hat = result.x[2]
print('MLE = Cross-entropy minimisation for logistic regression:')
print(f'True weights: w = {w_true}, b = {b_true}')
print(f'MLE estimates: w = {w_hat.round(3)}, b = {b_hat:.3f}')
print(f'Final NLL (training): {result.fun:.4f}')
# Verify: NLL ≡ cross-entropy
p_pred = sigmoid(X_cls @ w_hat + b_hat)
ce = -np.mean(y_cls * np.log(np.clip(p_pred, 1e-12, 1)) + (1-y_cls) * np.log(np.clip(1-p_pred, 1e-12, 1)))
print(f'Cross-entropy = {ce:.4f} (should match NLL = {result.fun:.4f})')
ok = np.isclose(ce, result.fun, atol=1e-6)
print(f'PASS NLL == Cross-entropy: {ok}')
5. Asymptotic Normality of MLE
Under regularity conditions, the MLE is asymptotically normal:
$$\sqrt{n}\,\big(\hat\theta_{\mathrm{MLE}} - \theta^*\big) \;\xrightarrow{d}\; \mathcal{N}\!\big(0,\ \mathcal{I}(\theta^*)^{-1}\big).$$
We verify this empirically for the Bernoulli MLE.
Code cell 19
# === 5.1 Asymptotic Normality Simulation for Bernoulli MLE ===
import numpy as np
import scipy.stats as stats
np.random.seed(42)
p_true = 0.3
M = 5000
sample_sizes = [10, 30, 100, 500]
# Fisher information: I(p) = 1/(p(1-p))
I_p = 1 / (p_true * (1 - p_true))
asym_var = 1 / I_p # = p(1-p)
asym_std = np.sqrt(asym_var)
print(f'Bernoulli(p={p_true}): I(p) = {I_p:.4f}, asymptotic variance = {asym_var:.4f}')
print()
normality_results = {}
for n in sample_sizes:
# Simulate M MLEs
samples = np.random.binomial(1, p_true, size=(M, n))
p_hats = samples.mean(axis=1)
# Standardise
z_scores = (p_hats - p_true) / (asym_std / np.sqrt(n))
# Kolmogorov-Smirnov test against N(0,1)
ks_stat, ks_pval = stats.kstest(z_scores, 'norm')
normality_results[n] = (p_hats, z_scores, ks_pval)
print(f'n={n:4d}: std(p_hat)={p_hats.std():.5f}, expected={asym_std/np.sqrt(n):.5f}, '
f'KS p-value={ks_pval:.4f}')
print('\nKS p-value > 0.05 means cannot reject normality')
Code cell 20
# === 5.2 Visualise Approach to Normality ===
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
COLORS = {'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988','highlight':'#EE3377','neutral':'#555555','error':'#CC3311'}
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
fig.suptitle('Standardised MLE $Z_n = \\sqrt{n}(\\hat{p} - p^*)/\\sqrt{p^*(1-p^*)}$ → N(0,1)',
fontsize=13)
z_range = np.linspace(-4, 4, 200)
normal_pdf = stats.norm.pdf(z_range)
colors_list = ['#0077BB','#EE7733','#009988','#EE3377']
for ax, (n, col) in zip(axes, zip(sample_sizes, colors_list)):
_, z_scores, ks_pval = normality_results[n]
ax.hist(z_scores, bins=50, density=True, alpha=0.6, color=col, edgecolor='white')
ax.plot(z_range, normal_pdf, color=COLORS['neutral'], lw=2, label='N(0,1)')
ax.set_title(f'$n = {n}$\nKS p={ks_pval:.3f}')
ax.set_xlabel('$Z_n$')
if n == 10:
ax.set_ylabel('Density')
ax.legend(fontsize=9)
ax.set_xlim(-4, 4)
fig.tight_layout()
plt.show()
print('By n=100, the distribution is visually indistinguishable from N(0,1)')
Code cell 21
# === 5.3 Delta Method: CI for Log-Odds ===
# If p_hat ~ N(p, p(1-p)/n), then by delta method:
# log(p/(1-p)) = g(p), g'(p) = 1/(p(1-p))
# Var(g(p_hat)) approx [g'(p)]^2 * p(1-p)/n = 1/(n*p(1-p))
p_true = 0.3
n = 100
M = 10000
np.random.seed(42)
samples = np.random.binomial(1, p_true, size=(M, n))
p_hats = samples.mean(axis=1)
log_odds_hats = np.log(p_hats / (1 - p_hats))
# Delta method prediction
log_odds_true = np.log(p_true / (1 - p_true))
var_delta = 1 / (n * p_true * (1 - p_true))
print('Delta method for log-odds transformation:')
print(f'True log-odds: {log_odds_true:.4f}')
print(f'Empirical mean of log-odds MLE: {log_odds_hats.mean():.4f}')
print(f'Empirical Var(log-odds MLE): {log_odds_hats.var():.6f}')
print(f'Delta method prediction: {var_delta:.6f}')
ok = np.isclose(log_odds_hats.var(), var_delta, rtol=0.1)
print(f'PASS delta method variance prediction: {ok}')
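A practical companion to the asymptotic-normality result of this section: from a single dataset, the unknown $\mathcal{I}(\theta^*)$ is replaced by a plug-in, either $\mathcal{I}(\hat\theta)$ or the observed information $J(\hat\theta) = -\ell''(\hat\theta)$, and $1/\sqrt{J(\hat\theta)}$ is reported as the standard error. A minimal sketch for the Exponential($\lambda$) model, where $J(\hat\lambda) = n/\hat\lambda^2$; the sample, seed, and finite-difference step are illustrative assumptions:

```python
import numpy as np

rng = np.random.default_rng(0)
n, lam_true = 200, 2.0
x = rng.exponential(1 / lam_true, n)
lam_hat = 1 / x.mean()                                 # MLE for the rate

loglik = lambda lam: n * np.log(lam) - lam * x.sum()   # Exponential log-likelihood

# Observed information J = -l''(lam_hat), via a central second difference
h = 1e-4
J = -(loglik(lam_hat + h) - 2 * loglik(lam_hat) + loglik(lam_hat - h)) / h**2
se_obs = 1 / np.sqrt(J)

print(f'lambda_hat = {lam_hat:.4f}')
print(f'observed-information SE = {se_obs:.4f}, analytic lam_hat/sqrt(n) = {lam_hat / np.sqrt(n):.4f}')
```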
6. Confidence Intervals
We construct and compare three types of CIs:
- Exact (pivotal method using Student's $t$ distribution)
- Asymptotic (Wald CI using Fisher information; a minimal sketch follows after this list)
- Bootstrap (non-parametric resampling)
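For the Wald case the interval is simply $\hat\theta \pm z_{1-\alpha/2}\big/\sqrt{n\,\mathcal{I}(\hat\theta)}$, i.e. the plug-in Fisher information supplies the standard error. A minimal sketch for the Bernoulli model, assuming the illustrative sample below; Section 14 compares this interval against Wilson and Clopper-Pearson:

```python
import numpy as np
from scipy import stats

def wald_ci_bernoulli(x, alpha=0.05):
    """Wald CI: p_hat +/- z * sqrt(p_hat (1 - p_hat) / n), using 1/(n I(p_hat)) as the variance."""
    n = len(x)
    p_hat = x.mean()
    se = np.sqrt(p_hat * (1 - p_hat) / n)   # = 1 / sqrt(n * I(p_hat))
    z = stats.norm.ppf(1 - alpha / 2)
    return p_hat - z * se, p_hat + z * se

rng = np.random.default_rng(0)
x = rng.binomial(1, 0.3, 100)               # illustrative sample
lo, hi = wald_ci_bernoulli(x)
print(f'p_hat = {x.mean():.3f}, 95% Wald CI = [{lo:.3f}, {hi:.3f}]')
```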
Code cell 23
# === 6.1 Exact t-Confidence Interval for Gaussian Mean ===
np.random.seed(42)
mu_true = 5.0
sigma_true = 2.0
alpha = 0.05
def t_ci(data, alpha=0.05):
n = len(data)
xbar = data.mean()
s = data.std(ddof=1)
t_crit = stats.t.ppf(1 - alpha/2, df=n-1)
margin = t_crit * s / np.sqrt(n)
return xbar - margin, xbar + margin
# Verify coverage
n_values = [5, 10, 20, 50, 100]
M = 5000
print(f'Coverage verification for 95% t-CI, true mu={mu_true}')
print(f'{"n":>6} {"Coverage":>10} {"Expected":>10} {"Width":>10}')
print('-' * 40)
for n in n_values:
data_sim = np.random.normal(mu_true, sigma_true, size=(M, n))
coverages = []
widths = []
for i in range(M):
lo, hi = t_ci(data_sim[i], alpha)
coverages.append(lo <= mu_true <= hi)
widths.append(hi - lo)
print(f'{n:>6} {np.mean(coverages):>10.4f} {0.95:>10.4f} {np.mean(widths):>10.4f}')
print('Coverage ≈ 0.95 for all n — exact CI works for any sample size')
Code cell 24
# === 6.2 Bootstrap Confidence Intervals ===
np.random.seed(42)
def bootstrap_ci(data, stat_fn, B=2000, alpha=0.05):
"""Non-parametric bootstrap CI using the percentile method."""
n = len(data)
boot_stats = np.array([stat_fn(data[np.random.randint(0, n, n)]) for _ in range(B)])
lo = np.percentile(boot_stats, 100 * alpha/2)
hi = np.percentile(boot_stats, 100 * (1 - alpha/2))
return lo, hi, boot_stats
# Bootstrap CI for the median (asymptotic CI is harder to compute)
n_bs = 30
sample = np.random.normal(mu_true, sigma_true, n_bs)
# t-CI for mean
ci_mean_lo, ci_mean_hi = t_ci(sample)
# Bootstrap CI for median
ci_med_lo, ci_med_hi, boot_medians = bootstrap_ci(sample, np.median, B=2000)
print(f'Sample (n={n_bs}): mean={sample.mean():.3f}, median={np.median(sample):.3f}')
print(f'True mu = {mu_true}')
print()
print(f't-CI for mean: [{ci_mean_lo:.3f}, {ci_mean_hi:.3f}] (width={ci_mean_hi-ci_mean_lo:.3f})')
print(f'Bootstrap CI for median: [{ci_med_lo:.3f}, {ci_med_hi:.3f}] (width={ci_med_hi-ci_med_lo:.3f})')
print(f'True mu in bootstrap CI for median: {ci_med_lo <= mu_true <= ci_med_hi}')
Code cell 25
# === 6.3 Visualise Bootstrap Distribution ===
try:
import matplotlib.pyplot as plt
COLORS = {'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988','error':'#CC3311','neutral':'#555555','highlight':'#EE3377'}
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
# Left: Bootstrap distribution of the median
axes[0].hist(boot_medians, bins=50, density=True, alpha=0.7,
color=COLORS['primary'], edgecolor='white', label='Bootstrap medians')
axes[0].axvline(np.median(sample), color=COLORS['error'], lw=2, label='Observed median')
axes[0].axvline(mu_true, color=COLORS['neutral'], lw=1.5, linestyle='--', label='True $\\mu$')
axes[0].axvspan(ci_med_lo, ci_med_hi, alpha=0.15, color=COLORS['tertiary'], label='95% bootstrap CI')
axes[0].set_title('Bootstrap distribution of sample median')
axes[0].set_xlabel('Sample median')
axes[0].set_ylabel('Density')
axes[0].legend(fontsize=9)
# Right: 20 simulated t-CIs showing coverage
np.random.seed(123)
M_show = 20
sim_data = np.random.normal(mu_true, sigma_true, size=(M_show, n_bs))
axes[1].axvline(mu_true, color=COLORS['neutral'], lw=1.5, linestyle='--', label='True $\\mu$')
for i in range(M_show):
lo, hi = t_ci(sim_data[i])
covered = lo <= mu_true <= hi
color = COLORS['primary'] if covered else COLORS['error']
axes[1].plot([lo, hi], [i, i], color=color, lw=2, alpha=0.8)
axes[1].scatter([sim_data[i].mean()], [i], color=color, s=30)
axes[1].set_title(f'20 simulated 95% t-CIs (red = misses true $\\mu$)')
axes[1].set_xlabel('$\\mu$')
axes[1].set_ylabel('Experiment')
fig.tight_layout()
plt.show()
7. Method of Moments
We compute MoM estimators for the Gamma and Beta distributions and compare to MLE.
Code cell 27
# === 7.1 Method of Moments vs MLE ===
np.random.seed(42)
n = 200
# --- Gamma(alpha, beta) ---
# Mean: alpha/beta, Variance: alpha/beta^2
alpha_true, beta_true = 3.0, 2.0
x_gamma = np.random.gamma(alpha_true, 1/beta_true, n) # numpy parameterises by scale = 1/beta
# MoM
xbar_g = x_gamma.mean()
s2_g = x_gamma.var()
beta_mom = xbar_g / s2_g
alpha_mom = xbar_g * beta_mom
# MLE (numerical)
def neg_loglik_gamma(params, data):
a, b = params
if a <= 0 or b <= 0:
return 1e10
return -(n * a * np.log(b) - n * gammaln(a) + (a-1)*np.sum(np.log(data)) - b*np.sum(data))
result_g = optimize.minimize(neg_loglik_gamma, [alpha_mom, beta_mom],
args=(x_gamma,), method='L-BFGS-B',
bounds=[(1e-6, None), (1e-6, None)])
alpha_mle, beta_mle = result_g.x
print('Gamma distribution estimation:')
print(f' True: alpha={alpha_true}, beta={beta_true}')
print(f' MoM: alpha={alpha_mom:.4f}, beta={beta_mom:.4f}')
print(f' MLE: alpha={alpha_mle:.4f}, beta={beta_mle:.4f}')
print()
# --- Beta(alpha, beta) ---
a_true, b_true = 2.0, 5.0
x_beta = np.random.beta(a_true, b_true, n)
m_b = x_beta.mean()
v_b = x_beta.var()
common = m_b*(1-m_b)/v_b - 1
a_mom = m_b * common
b_mom = (1 - m_b) * common
print('Beta distribution estimation:')
print(f' True: alpha={a_true}, beta={b_true}')
print(f' MoM: alpha={a_mom:.4f}, beta={b_mom:.4f}')
ok = np.isclose(a_mom, a_true, rtol=0.15) and np.isclose(b_mom, b_true, rtol=0.15)
print(f'PASS MoM estimates within 15% of true values: {ok}')
8. Fisher Information in Machine Learning
8.1 Natural Gradient vs. Vanilla Gradient
The natural gradient is parameterisation-invariant. We demonstrate this advantage on a Bernoulli estimation problem.
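The invariance claim can be checked directly: take one natural-gradient step on the Bernoulli log-likelihood in the $p$ parameterisation and in the log-odds parameterisation $\psi = \log\frac{p}{1-p}$, and compare the resulting change in $p$. A minimal sketch; the data (35 successes in 50 trials), starting point, and step size are illustrative assumptions. To first order in the step size the two natural-gradient updates agree, whereas the vanilla-gradient updates do not:

```python
import numpy as np

n, S, p0, eta = 50, 35, 0.5, 0.01           # illustrative data and step size

# --- parameterisation 1: p ------------------------------------------------
score_p = S / p0 - (n - S) / (1 - p0)       # dl/dp
fisher_p = n / (p0 * (1 - p0))              # I(p)
p_nat = p0 + eta * score_p / fisher_p       # natural-gradient step
p_van = p0 + eta * score_p                  # vanilla gradient step

# --- parameterisation 2: psi = log(p / (1 - p)) ----------------------------
to_p = lambda psi: 1 / (1 + np.exp(-psi))
psi0 = np.log(p0 / (1 - p0))
score_psi = S - n * p0                      # dl/dpsi
fisher_psi = n * p0 * (1 - p0)              # I(psi)
psi_nat = to_p(psi0 + eta * score_psi / fisher_psi)
psi_van = to_p(psi0 + eta * score_psi)

print(f'natural step:  p-space -> {p_nat:.4f},  psi-space -> {psi_nat:.4f}  (agree)')
print(f'vanilla step:  p-space -> {p_van:.4f},  psi-space -> {psi_van:.4f}  (disagree)')
```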
Code cell 29
# === 8.1 Natural Gradient vs Standard Gradient for Bernoulli MLE ===
# We maximise the Bernoulli log-likelihood with both methods.
# Standard gradient: dp/dt = dL/dp
# Natural gradient: dp/dt = I(p)^{-1} * dL/dp = p(1-p) * dL/dp
np.random.seed(42)
p_true = 0.7
n_data = 50
data = np.random.binomial(1, p_true, n_data)
S = data.sum()
def log_lik_bernoulli(p):
p = np.clip(p, 1e-10, 1-1e-10)
return S * np.log(p) + (n_data - S) * np.log(1 - p)
def score_bernoulli(p):
p = np.clip(p, 1e-10, 1-1e-10)
return S/p - (n_data - S)/(1-p)
def fisher_bernoulli(p):
p = np.clip(p, 1e-10, 1-1e-10)
return n_data / (p * (1-p))
# Standard gradient ascent
eta = 0.001
p_std = 0.5
path_std = [p_std]
for _ in range(200):
p_std = np.clip(p_std + eta * score_bernoulli(p_std), 1e-6, 1-1e-6)
path_std.append(p_std)
# Natural gradient ascent (eta_nat much larger -- needs fewer steps)
eta_nat = 0.1
p_nat = 0.5
path_nat = [p_nat]
for _ in range(200):
nat_grad = score_bernoulli(p_nat) / fisher_bernoulli(p_nat)
p_nat = np.clip(p_nat + eta_nat * nat_grad, 1e-6, 1-1e-6)
path_nat.append(p_nat)
p_mle = S / n_data
print(f'True p = {p_true}, MLE = {p_mle:.4f}')
print(f'Standard gradient (final): p = {path_std[-1]:.4f} after 200 steps')
print(f'Natural gradient (final): p = {path_nat[-1]:.4f} after 200 steps')
steps_to_conv_nat = next((i for i,v in enumerate(path_nat) if abs(v-p_mle)<0.01), 200)
steps_to_conv_std = next((i for i,v in enumerate(path_std) if abs(v-p_mle)<0.01), 200)
print(f'Steps to convergence (tol=0.01): standard={steps_to_conv_std}, natural={steps_to_conv_nat}')
print('Natural gradient converges faster by exploiting the geometry of the likelihood')
Code cell 30
# === 8.2 Visualise Gradient Convergence ===
try:
import matplotlib.pyplot as plt
COLORS = {'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988','error':'#CC3311','neutral':'#555555'}
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
steps = np.arange(len(path_std))
axes[0].plot(steps, path_std, color=COLORS['primary'], lw=2, label='Standard gradient')
axes[0].plot(steps, path_nat, color=COLORS['secondary'], lw=2, label='Natural gradient')
axes[0].axhline(p_mle, color=COLORS['error'], lw=1.5, linestyle='--', label=f'MLE = {p_mle:.2f}')
axes[0].set_title('Convergence to MLE: standard vs. natural gradient')
axes[0].set_xlabel('Iteration')
axes[0].set_ylabel('$p$')
axes[0].legend()
# Plot the log-likelihood surface
p_grid = np.linspace(0.05, 0.95, 200)
ll = np.array([log_lik_bernoulli(p) for p in p_grid])
axes[1].plot(p_grid, ll, color=COLORS['primary'], lw=2)
axes[1].scatter(path_std[::20], [log_lik_bernoulli(p) for p in path_std[::20]],
color=COLORS['primary'], s=40, alpha=0.6, label='Standard path')
axes[1].scatter(path_nat[::5], [log_lik_bernoulli(p) for p in path_nat[::5]],
color=COLORS['secondary'], s=60, alpha=0.8, label='Natural path')
axes[1].axvline(p_mle, color=COLORS['error'], lw=1.5, linestyle='--', label='MLE')
axes[1].set_title('Optimisation path on log-likelihood surface')
axes[1].set_xlabel('$p$')
axes[1].set_ylabel('$\\ell(p)$')
axes[1].legend(fontsize=9)
fig.tight_layout()
plt.show()
9. Fisher Information Matrix for a Neural Network Layer
For a softmax output layer $p(y \mid x; W) = \operatorname{softmax}(Wx)$, the FIM over the weights has (approximately) a Kronecker product structure, $F \approx G \otimes A$, where $A = \mathbb{E}[xx^\top]$ is the input second-moment matrix and $G = \mathbb{E}\big[\operatorname{diag}(p) - pp^\top\big]$ is the predictive-covariance factor of the outputs.
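One reason this structure matters: the inverse of a Kronecker product factorises, $(G \otimes A)^{-1} = G^{-1} \otimes A^{-1}$, so a natural-gradient step needs only the small $K \times K$ and $d \times d$ inverses instead of the full $Kd \times Kd$ one. A minimal sketch of the identity with random positive-definite factors; the dimensions are illustrative and not tied to the cell below:

```python
import numpy as np

rng = np.random.default_rng(0)
K, d = 3, 4

# Random symmetric positive-definite factors
G = rng.standard_normal((K, K)); G = G @ G.T + K * np.eye(K)
A = rng.standard_normal((d, d)); A = A @ A.T + d * np.eye(d)

inv_full = np.linalg.inv(np.kron(G, A))                     # invert the (K*d) x (K*d) matrix
inv_factored = np.kron(np.linalg.inv(G), np.linalg.inv(A))  # invert the factors separately

print('max abs difference:', np.abs(inv_full - inv_factored).max())
```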
Code cell 32
# === 9.1 Fisher Information Matrix for Softmax Layer ===
np.random.seed(42)
d = 4 # input dim
K = 3 # num classes
n_fim = 500 # number of samples
# Random weight matrix and input distribution
W = np.random.randn(K, d) * 0.5
X_fim = np.random.randn(n_fim, d)
def softmax(z):
e = np.exp(z - z.max(axis=-1, keepdims=True))
return e / e.sum(axis=-1, keepdims=True)
# Compute scores = W @ x for each sample
logits = X_fim @ W.T # (n, K)
P = softmax(logits) # (n, K)
# Full FIM (K*d x K*d) via outer product of score vectors
# Score for weight W: d log p(y|x;W)/d W = (e_y - p_hat) x^T
# FIM = E[(e_y - p_hat) x^T (vectorized)][(e_y - p_hat) x^T]^T
# Approximate FIM by sampling y from p_hat
FIM_full = np.zeros((K*d, K*d))
for i in range(n_fim):
p_i = P[i] # (K,)
x_i = X_fim[i] # (d,)
# For each possible y, contribution weighted by p_hat_k
for k in range(K):
e_k = np.zeros(K)
e_k[k] = 1.0
delta = e_k - p_i # (K,)
grad_vec = np.outer(delta, x_i).reshape(-1) # K*d
FIM_full += p_i[k] * np.outer(grad_vec, grad_vec)
FIM_full /= n_fim
# K-FAC approximation: FIM ≈ G ⊗ A
A = (X_fim.T @ X_fim) / n_fim # (d, d) input covariance
G = np.zeros((K, K))
for i in range(n_fim):
p_i = P[i]
G += np.diag(p_i) - np.outer(p_i, p_i)
G /= n_fim
FIM_kfac = np.kron(G, A) # K-FAC approximation
# Compare
frob_full = np.linalg.norm(FIM_full)
frob_err = np.linalg.norm(FIM_full - FIM_kfac)
rel_err = frob_err / frob_full
print(f'Full FIM shape: {FIM_full.shape}')
print(f'Frobenius norm of full FIM: {frob_full:.4f}')
print(f'Frobenius error of K-FAC approx: {frob_err:.4f}')
print(f'Relative error: {rel_err:.4f} ({100*rel_err:.1f}%)')
print(f'K-FAC approximation captures the Kronecker structure of the FIM')
10. Scaling Laws via Nonlinear MLE
The Chinchilla scaling law (Hoffmann et al., 2022) models language-model loss as a function of parameter count $N$ and training tokens $D$:
$$L(N, D) = E + \frac{A}{N^{\alpha}} + \frac{B}{D^{\beta}}.$$
We simulate this estimation problem and compute CIs for the exponents.
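With fitted constants in hand, the compute-optimal allocation under the constraint $C \approx 6ND$ also has a closed form: setting $\frac{\partial}{\partial N}\big[A N^{-\alpha} + B (6N/C)^{\beta}\big] = 0$ gives $N^*(C) = \big(\frac{\alpha A}{\beta B}\big)^{\frac{1}{\alpha+\beta}} (C/6)^{\frac{\beta}{\alpha+\beta}}$ and $D^* = C/(6N^*)$. A minimal sketch using the same Chinchilla-like constants assumed as ground truth in the cell below; the grid search there should land close to this value:

```python
import numpy as np

# Illustrative Chinchilla-like constants (the ground-truth values used in the next cell)
E, A, B, alpha, beta = 1.69, 406.4, 410.7, 0.34, 0.28
C = 6e20                                   # FLOP budget, with C ~ 6 N D

# Closed-form compute-optimal allocation
N_opt = (alpha * A / (beta * B)) ** (1 / (alpha + beta)) * (C / 6) ** (beta / (alpha + beta))
D_opt = C / (6 * N_opt)
L_opt = E + A / N_opt**alpha + B / D_opt**beta

print(f'N* = {N_opt:.3e} params, D* = {D_opt:.3e} tokens, predicted loss = {L_opt:.4f}')
```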
Code cell 34
# === 10.1 Scaling Law Estimation via MLE ===
from scipy.optimize import curve_fit
np.random.seed(42)
# True Chinchilla-like parameters
E_true = 1.69
A_true = 406.4
B_true = 410.7
alpha_true = 0.34
beta_true = 0.28
def scaling_law(ND, E, A, B, alpha, beta):
N, D = ND
return E + A / N**alpha + B / D**beta
# Generate synthetic (N, D) pairs spanning orders of magnitude
N_grid = np.logspace(8, 12, 6) # 10^8 to 10^12 parameters
D_grid = np.logspace(9, 13, 6) # 10^9 to 10^13 tokens
NN, DD = np.meshgrid(N_grid, D_grid)
N_flat = NN.flatten()
D_flat = DD.flatten()
# True losses with noise
L_true = scaling_law((N_flat, D_flat), E_true, A_true, B_true, alpha_true, beta_true)
noise = 0.01 * np.random.randn(len(L_true))
L_obs = L_true + noise
# Fit via nonlinear least squares (= MLE under Gaussian noise assumption)
p0 = [1.7, 400, 400, 0.3, 0.3] # initial guess
bounds = ([0, 0, 0, 0.1, 0.1], [5, 1e4, 1e4, 1.0, 1.0])
popt, pcov = curve_fit(scaling_law, (N_flat, D_flat), L_obs,
p0=p0, bounds=bounds, maxfev=10000)
perr = np.sqrt(np.diag(pcov)) # standard errors
print('Scaling law parameter estimation:')
names = ['E', 'A', 'B', 'alpha', 'beta']
true_vals = [E_true, A_true, B_true, alpha_true, beta_true]
print(f'{"Param":<8} {"True":>8} {"MLE":>8} {"SE":>8} {"95% CI":>20}')
print('-' * 56)
for name, true, est, se in zip(names, true_vals, popt, perr):
ci = f'[{est-1.96*se:.3f}, {est+1.96*se:.3f}]'
print(f'{name:<8} {true:>8.3f} {est:>8.3f} {se:>8.4f} {ci:>20}')
# Optimal allocation for fixed compute C
E_fit, A_fit, B_fit, alpha_fit, beta_fit = popt
C_budget = 6e20 # 6 * 10^20 FLOPs
N_candidates = np.logspace(8, 12, 1000)
D_candidates = C_budget / (6 * N_candidates)
L_candidates = scaling_law((N_candidates, D_candidates), *popt)
N_opt = N_candidates[np.argmin(L_candidates)]
D_opt = D_candidates[np.argmin(L_candidates)]
print(f'\nOptimal allocation for C=6e20 FLOPs:')
print(f' N* = {N_opt:.2e} parameters, D* = {D_opt:.2e} tokens')
print(f' Optimal loss: {L_candidates.min():.4f}')
11. Elastic Weight Consolidation (EWC)
EWC (Kirkpatrick et al., 2017) uses the diagonal FIM to penalise changes to weights that were important for a previous task $A$ while training on a new task $B$:
$$\mathcal{L}(\theta) = \mathcal{L}_B(\theta) + \frac{\lambda}{2} \sum_i F_i \big(\theta_i - \theta^{*}_{A,i}\big)^2.$$
We simulate catastrophic forgetting and EWC on a simple linear regression problem.
Code cell 36
# === 11.1 EWC Simulation for Continual Learning ===
np.random.seed(42)
# Task 1: learn y = 2*x1 + 1*x2
n_t = 100
X1 = np.random.randn(n_t, 2)
w_true_1 = np.array([2.0, 1.0])
y1 = X1 @ w_true_1 + 0.1 * np.random.randn(n_t)
# Task 2: learn y = 0.5*x1 + 3*x2
X2 = np.random.randn(n_t, 2)
w_true_2 = np.array([0.5, 3.0])
y2 = X2 @ w_true_2 + 0.1 * np.random.randn(n_t)
# Fit Task 1: OLS
w1_hat = np.linalg.lstsq(X1, y1, rcond=None)[0]
# Fisher information (diagonal) for Task 1 = X1^T X1 / n (Gaussian regression)
F_diag = np.diag(X1.T @ X1) / n_t
# Naive fine-tuning on Task 2 (forgets Task 1)
w2_naive = np.linalg.lstsq(X2, y2, rcond=None)[0]
# EWC fine-tuning: add FIM penalty
lambda_ewc = 10.0
def ewc_loss_and_grad(w, X2, y2, w1_star, F, lam):
"""Task 2 loss + EWC penalty"""
residuals = y2 - X2 @ w
task2_loss = np.sum(residuals**2) / len(y2)
ewc_penalty = 0.5 * lam * np.sum(F * (w - w1_star)**2)
grad = -2 * X2.T @ residuals / len(y2) + lam * F * (w - w1_star)
return task2_loss + ewc_penalty, grad
# Gradient descent with EWC
w_ewc = w1_hat.copy()
for _ in range(2000):
loss, grad = ewc_loss_and_grad(w_ewc, X2, y2, w1_hat, F_diag, lambda_ewc)
w_ewc -= 0.01 * grad
# Evaluate on both tasks
def task_mse(w, X, y):
return np.mean((y - X @ w)**2)
print('Continual learning comparison:')
print(f'True Task 1 weights: {w_true_1}')
print(f'True Task 2 weights: {w_true_2}')
print()
print(f'{"Method":<20} {"Task 1 MSE":>12} {"Task 2 MSE":>12}')
print('-' * 46)
print(f'{"After Task 1 only":<20} {task_mse(w1_hat, X1, y1):>12.4f} {"N/A":>12}')
print(f'{"Naive fine-tune":<20} {task_mse(w2_naive, X1, y1):>12.4f} {task_mse(w2_naive, X2, y2):>12.4f}')
print(f'{"EWC fine-tune":<20} {task_mse(w_ewc, X1, y1):>12.4f} {task_mse(w_ewc, X2, y2):>12.4f}')
print(f'\nFisher diagonal F = {F_diag}')
print(f'EWC protects weight j={np.argmax(F_diag)} (higher FIM) most strongly')
Summary
This notebook has demonstrated the key results of estimation theory:
| Concept | Key Result | ML Connection |
|---|---|---|
| Sampling distribution | Estimators are random variables | Training randomness |
| Bias-variance decomposition | MSE = Bias² + Var | Regularisation trade-off |
| Fisher information | $\mathcal{I}(\theta) = \operatorname{Var}(\text{score}) = -\mathbb{E}[\partial_\theta^2 \log p]$ | Natural gradient, K-FAC |
| Cramér-Rao bound | $\operatorname{Var}(\hat\theta) \ge 1/(n\,\mathcal{I}(\theta))$ | Efficiency limits |
| MLE = Cross-entropy | Minimising NLL ≡ maximising likelihood | All DL training |
| Asymptotic normality | $\hat\theta_n \approx \mathcal{N}(\theta^*, \mathcal{I}^{-1}/n)$ for large $n$ | Error bars |
| Bootstrap CI | Resampling without assumptions | Model evaluation |
| Natural gradient | $\tilde\nabla\ell = \mathcal{I}^{-1}\nabla\ell$ | K-FAC, Shampoo |
| Scaling law MLE | Nonlinear least squares | Chinchilla |
| EWC | Diagonal FIM penalty | Continual learning |
Next: Hypothesis Testing — using estimators to make formal decisions about data.
12. Sufficient Statistics
A sufficient statistic captures all the information about $\theta$ that the data contain. We demonstrate this for the Bernoulli model: knowing $T = \sum_i x_i$ (the total number of successes) is equivalent to knowing the full dataset for the purpose of estimating $p$.
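Sufficiency means the conditional distribution of the sample given $T$ does not depend on $p$; for Bernoulli data it is uniform over the $\binom{n}{t}$ arrangements, so in particular $P(x_1 = 1 \mid T = t) = t/n$ for every $p$. A minimal sketch checking this by Monte Carlo; the values of $n$, $t$, $p$, and the number of simulations are illustrative:

```python
import numpy as np

rng = np.random.default_rng(0)
n, t, M = 10, 6, 200_000

for p in (0.3, 0.7):
    x = rng.binomial(1, p, size=(M, n))
    keep = x.sum(axis=1) == t                 # condition on the sufficient statistic T = t
    cond_prob = x[keep, 0].mean()             # Monte Carlo estimate of P(x1 = 1 | T = t)
    print(f'p={p}: P(x1=1 | T={t}) ~ {cond_prob:.3f}  (theory: t/n = {t/n:.3f})')
```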
Code cell 39
# === 12.1 Sufficient Statistic for Bernoulli ===
import numpy as np
np.random.seed(42)
# For Bernoulli(p), the sufficient statistic is T = sum(x)
# Rao-Blackwell: any estimator conditioned on T has lower variance
p_true = 0.4
n = 20
M = 10000
data = np.random.binomial(1, p_true, size=(M, n))
T = data.sum(axis=1) # sufficient statistic
# Estimator 1: based on full data (sample mean)
est1 = data.mean(axis=1)
# Estimator 2: based only on first two observations
est2 = data[:, :2].mean(axis=1)
# Rao-Blackwell improvement of est2: E[est2 | T]
# E[x1+x2 | T=t] = 2 * E[x1 | T=t] = 2 * t * P(x1=1|T=t) = 2 * t/n
# So E[est2|T] = t/n = sample mean
est2_rb = T / n # Rao-Blackwell estimator
print(f'Bernoulli(p={p_true}), n={n}')
print(f'Estimator | E[est] | Var(est) | MSE')
print('-' * 58)
for name, est in [('Sample mean (est1)', est1),
('First 2 obs (est2)', est2),
('RB improvement (est2_rb)', est2_rb)]:
bias = est.mean() - p_true
var = est.var()
mse = ((est - p_true)**2).mean()
print(f'{name:<28} {est.mean():.5f} {var:.7f} {mse:.7f}')
print()
ok = np.isclose(est1.var(), est2_rb.var(), atol=1e-4)
print(f'PASS Rao-Blackwell: Var(est2_rb) ≈ Var(est1): {ok}')
print('Conditioning on T cannot increase variance (Rao-Blackwell theorem)')
13. Consistency: Convergence to Truth
An estimator $\hat\theta_n$ is consistent if $\hat\theta_n \xrightarrow{p} \theta^*$ as $n \to \infty$. We show this graphically for three estimators.
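For the sample mean, consistency follows directly from Chebyshev's inequality (the weak law of large numbers for finite-variance data):

$$
P\big(|\bar{x}_n - \theta^*| > \varepsilon\big) \le \frac{\operatorname{Var}(\bar{x}_n)}{\varepsilon^2} = \frac{\sigma^2}{n\varepsilon^2} \xrightarrow[n \to \infty]{} 0.
$$

With $\sigma = 1$ and $\varepsilon = 0.2$ this bound is $25/n$, consistent with the near-zero exceedance probability observed at $n = 1000$ below.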
Code cell 41
# === 13.1 Consistency vs Inconsistency ===
import numpy as np
np.random.seed(42)
theta_true = 3.0
sigma = 1.0
n_max = 1000
n_values = np.arange(1, n_max+1)
M = 500 # trajectories to average
# Generate a large dataset
data = np.random.normal(theta_true, sigma, size=(M, n_max))
# Three estimators
# 1. Sample mean (consistent)
cumsum = data.cumsum(axis=1)
xbar = cumsum / n_values[np.newaxis, :] # (M, n_max)
# 2. First observation only (inconsistent)
x_first = np.tile(data[:, 0:1], (1, n_max)) # (M, n_max) constant
# 3. MLE of variance (biased but consistent)
xbar2 = xbar**2
x2bar = (data**2).cumsum(axis=1) / n_values[np.newaxis, :]
var_mle = x2bar - xbar**2 # biased MLE variance (mean is theta, var is sigma^2=1)
# Compute P(|est - true| > epsilon) at each n
eps = 0.2
prob_xbar = (np.abs(xbar - theta_true) > eps).mean(axis=0)
prob_first = (np.abs(x_first - theta_true) > eps).mean(axis=0)
prob_var = (np.abs(var_mle - sigma**2) > eps).mean(axis=0)
try:
import matplotlib.pyplot as plt
COLORS = {'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988','error':'#CC3311','neutral':'#555555'}
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
# Sample paths
n_show = 5
for i in range(n_show):
axes[0].plot(n_values, xbar[i], alpha=0.4, color=COLORS['primary'], lw=0.8)
axes[0].plot(n_values, x_first[i, :], alpha=0.4, color=COLORS['secondary'], lw=0.8)
axes[0].axhline(theta_true, color=COLORS['error'], lw=2, linestyle='--', label=f'True $\\theta^*={theta_true}$')
axes[0].plot([], [], color=COLORS['primary'], lw=1.5, label='Sample mean (consistent)')
axes[0].plot([], [], color=COLORS['secondary'], lw=1.5, label='First obs (inconsistent)')
axes[0].set_title('Estimator paths for 5 simulations')
axes[0].set_xlabel('Sample size $n$')
axes[0].set_ylabel('Estimate')
axes[0].legend(fontsize=9)
axes[0].set_ylim(0, 6)
# Convergence in probability
axes[1].plot(n_values, prob_xbar, color=COLORS['primary'], lw=2,
label='$P(|\\bar{x}_n - \\theta^*| > 0.2)$')
axes[1].plot(n_values, prob_first, color=COLORS['secondary'], lw=2,
label='$P(|x^{(1)} - \\theta^*| > 0.2)$')
axes[1].plot(n_values, prob_var, color=COLORS['tertiary'], lw=2,
label='$P(|\\hat{\\sigma}^2_{MLE} - \\sigma^2| > 0.2)$')
axes[1].set_title('Convergence in probability')
axes[1].set_xlabel('Sample size $n$')
axes[1].set_ylabel('$P(|\\hat\\theta_n - \\theta^*| > \\varepsilon)$')
axes[1].legend(fontsize=8)
fig.tight_layout()
plt.show()
print(f'At n=1000, P(|xbar - theta| > 0.2) = {prob_xbar[-1]:.4f}')
print(f'At n=1000, P(|first obs - theta| > 0.2) = {prob_first[-1]:.4f}')
print('Sample mean is consistent; first observation is not')
14. Confidence Interval Coverage Verification
We verify that 95% CIs achieve 95% coverage in repeated experiments.
Code cell 43
# === 14.1 CI Coverage Verification for Multiple CI Types ===
import numpy as np
from scipy import stats
np.random.seed(42)
p_binom = 0.3
n = 50
M = 10000
alpha = 0.05
z = stats.norm.ppf(1 - alpha/2)
data = np.random.binomial(1, p_binom, size=(M, n))
p_hat = data.mean(axis=1)
# 1. Wald CI
se_wald = np.sqrt(p_hat * (1 - p_hat) / n)
wald_lo = p_hat - z * se_wald
wald_hi = p_hat + z * se_wald
# 2. Wilson score CI
denom = 1 + z**2 / n
centre = (p_hat + z**2 / (2*n)) / denom
margin = z * np.sqrt(p_hat*(1-p_hat)/n + z**2/(4*n**2)) / denom
wilson_lo = centre - margin
wilson_hi = centre + margin
# 3. Exact Clopper-Pearson
k = (data.sum(axis=1)).astype(int)
cp_lo = stats.beta.ppf(alpha/2, k, n-k+1)
cp_hi = stats.beta.ppf(1-alpha/2, k+1, n-k)
cp_lo = np.where(k == 0, 0.0, cp_lo)
cp_hi = np.where(k == n, 1.0, cp_hi)
# Compute coverage
wald_cov = np.mean((wald_lo <= p_binom) & (p_binom <= wald_hi))
wilson_cov = np.mean((wilson_lo <= p_binom) & (p_binom <= wilson_hi))
cp_cov = np.mean((cp_lo <= p_binom) & (p_binom <= cp_hi))
wald_w = np.mean(wald_hi - wald_lo)
wilson_w = np.mean(wilson_hi - wilson_lo)
cp_w = np.mean(cp_hi - cp_lo)
print(f'Coverage verification for p={p_binom}, n={n}, alpha={alpha}')
print(f'{"CI type":<20} {"Coverage":>10} {"Avg Width":>10}')
print('-' * 44)
print(f'{"Wald":<20} {wald_cov:>10.4f} {wald_w:>10.4f}')
print(f'{"Wilson score":<20} {wilson_cov:>10.4f} {wilson_w:>10.4f}')
print(f'{"Clopper-Pearson":<20} {cp_cov:>10.4f} {cp_w:>10.4f}')
print(f'Target coverage: {1-alpha:.2f}')
print(f'\nWald undercoverage = {1-alpha - wald_cov:.4f}')
print(f'Wilson overshoot = {wilson_cov - (1-alpha):.4f}')
print('Wilson ≈ nominal for all p; Wald fails near 0 and 1')
15. Multivariate Gaussian MLE
We verify the MLE formulas $\hat{\mu} = \frac{1}{n}\sum_{i} x_i$ and $\hat{\Sigma} = \frac{1}{n}\sum_{i} (x_i - \hat{\mu})(x_i - \hat{\mu})^\top$.
Code cell 45
# === 15.1 Multivariate Gaussian MLE ===
import numpy as np
np.random.seed(42)
d = 3
n = 500
mu_true = np.array([1.0, -2.0, 3.0])
# Positive definite covariance matrix
A = np.random.randn(d, d)
Sigma_true = A @ A.T / d + 0.5 * np.eye(d)
# Generate data
X = np.random.multivariate_normal(mu_true, Sigma_true, n)
# MLE estimates
mu_hat = X.mean(axis=0)
Sigma_hat_mle = (X - mu_hat).T @ (X - mu_hat) / n # biased (1/n)
Sigma_hat_unb = (X - mu_hat).T @ (X - mu_hat) / (n-1) # unbiased (1/(n-1))
print(f'Multivariate Gaussian MLE (n={n}, d={d})')
print(f'True mu: {mu_true}')
print(f'MLE mu: {mu_hat.round(4)}')
print(f'Error: {np.linalg.norm(mu_hat - mu_true):.6f}')
print()
print('True Sigma (diagonal):', np.diag(Sigma_true).round(4))
print('MLE Sigma (diagonal):', np.diag(Sigma_hat_mle).round(4), '(biased)')
print('Unb Sigma (diagonal):', np.diag(Sigma_hat_unb).round(4), '(unbiased)')
print()
# Verify: MLE bias = -Sigma/n (elementwise approx)
bias = Sigma_hat_mle - Sigma_true
expected_bias = -Sigma_true / n
print('Bias verification (diagonal):')
print(f' Empirical bias: {np.diag(bias).round(6)}')
print(f' Expected -Sigma/n: {np.diag(expected_bias).round(6)}')
ok = np.allclose(np.diag(bias), np.diag(expected_bias), atol=0.05)
print(f'PASS bias ≈ -Sigma/n: {ok}')
# Positive definiteness
eigs = np.linalg.eigvalsh(Sigma_hat_mle)
print(f'\nMLE Sigma eigenvalues (all > 0?): {eigs.round(4)}')
print(f'Positive definite: {np.all(eigs > 0)}')
16. Model Misspecification
When the model family does not contain the true distribution, the MLE converges to the KL-minimising (pseudo-true) parameter $\theta^\dagger = \arg\min_\theta \operatorname{KL}\big(p^* \,\|\, p_\theta\big)$.
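For a Gaussian model this pseudo-true parameter is exactly the moment-matched one, $\mu^\dagger = \mathbb{E}_{p^*}[X]$ and $\sigma^{2\dagger} = \operatorname{Var}_{p^*}(X)$, because the KL depends on the model only through $\mathbb{E}_{p^*}[\log \mathcal{N}(x;\mu,\sigma^2)]$. A minimal sketch that checks this by minimising the expected negative log-likelihood with numerical quadrature for the mixture used below; the integration limits, starting point, and optimiser choice are illustrative assumptions:

```python
import numpy as np
from scipy import stats, optimize, integrate

# True distribution: 0.4 N(0,1) + 0.6 N(4,1); fitted family: a single Gaussian
p_star = lambda x: 0.4 * stats.norm.pdf(x, 0, 1) + 0.6 * stats.norm.pdf(x, 4, 1)

def neg_expected_loglik(params):
    """E_{p*}[ -log N(x; mu, sigma^2) ]; minimising this minimises KL(p* || model)."""
    mu, log_sigma = params
    sigma = np.exp(log_sigma)
    integrand = lambda x: -p_star(x) * stats.norm.logpdf(x, mu, sigma)
    return integrate.quad(integrand, -10, 14)[0]

res = optimize.minimize(neg_expected_loglik, x0=[1.0, 0.0], method='Nelder-Mead')
mu_dag, sigma2_dag = res.x[0], np.exp(res.x[1]) ** 2
print(f'KL-minimising mu = {mu_dag:.3f} (moment matching: 2.4)')
print(f'KL-minimising sigma^2 = {sigma2_dag:.3f} (moment matching: 4.84)')
```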
Code cell 47
# === 16.1 MLE Under Misspecification ===
import numpy as np
from scipy import stats, optimize
np.random.seed(42)
# True distribution: mixture of two Gaussians
# p*(x) = 0.4 * N(0, 1) + 0.6 * N(4, 1)
# We fit a single Gaussian N(mu, sigma^2) -- misspecified model
# True moments of p*
# E[X] = 0.4*0 + 0.6*4 = 2.4
# E[X^2] = 0.4*(0+1) + 0.6*(16+1) = 0.4 + 10.2 = 10.6
# Var = 10.6 - 2.4^2 = 10.6 - 5.76 = 4.84
mu_star_theory = 2.4
var_star_theory = 4.84
# Sample from true mixture
n = 2000
component = np.random.binomial(1, 0.6, n)
x_mix = np.where(component, np.random.normal(4, 1, n), np.random.normal(0, 1, n))
# MLE of Gaussian parameters on misspecified data
mu_mle_mis = x_mix.mean()
sigma2_mle_mis = x_mix.var()
print('MLE under misspecification:')
print(f'True model: 0.4*N(0,1) + 0.6*N(4,1)')
print(f'Fitted model: N(mu, sigma^2) [MISSPECIFIED]')
print()
print(f'MLE mu = {mu_mle_mis:.4f} (KL-minimising mu = {mu_star_theory})')
print(f'MLE sigma^2 = {sigma2_mle_mis:.4f} (KL-minimising var = {var_star_theory})')
ok = np.isclose(mu_mle_mis, mu_star_theory, atol=0.15)
print(f'PASS MLE converges to KL-minimising mu: {ok}')
try:
import matplotlib.pyplot as plt
COLORS = {'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988','error':'#CC3311'}
HAS_MPL = True
except ImportError:
HAS_MPL = False
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
x_range = np.linspace(-4, 9, 300)
# True mixture
p_true = 0.4 * stats.norm.pdf(x_range, 0, 1) + 0.6 * stats.norm.pdf(x_range, 4, 1)
# Fitted Gaussian
p_fit = stats.norm.pdf(x_range, mu_mle_mis, np.sqrt(sigma2_mle_mis))
ax.plot(x_range, p_true, color=COLORS['primary'], lw=2.5, label='True $p^*$ (mixture)')
ax.plot(x_range, p_fit, color=COLORS['error'], lw=2.5, linestyle='--',
label=f'Fitted Gaussian $\\mathcal{{N}}({mu_mle_mis:.2f}, {sigma2_mle_mis:.2f})$')
ax.hist(x_mix, bins=60, density=True, alpha=0.3, color=COLORS['primary'])
ax.set_title('MLE under misspecification: best Gaussian fit to a mixture')
ax.set_xlabel('$x$')
ax.set_ylabel('Density')
ax.legend()
fig.tight_layout()
plt.show()
print('The fitted Gaussian is the KL-minimising single Gaussian')
Code cell 48
# === 16.2 MoM vs MLE Efficiency Comparison ===
import numpy as np
np.random.seed(42)
# Compare MoM and MLE for Gamma(alpha=2, beta=1.5)
from scipy.special import gammaln
from scipy import optimize
alpha_t, beta_t = 2.0, 1.5
M_comp = 2000
n = 100
alpha_mom_list, beta_mom_list = [], []
alpha_mle_list, beta_mle_list = [], []
def neg_loglik_gamma_fast(params, data):
a, b = params
if a <= 0 or b <= 0:
return 1e10
n_d = len(data)
return -(n_d*a*np.log(b) - n_d*gammaln(a) + (a-1)*np.sum(np.log(data+1e-15)) - b*np.sum(data))
for _ in range(M_comp):
x = np.random.gamma(alpha_t, 1/beta_t, n)
xb = x.mean()
s2 = x.var()
beta_m = xb / s2
alpha_m = xb * beta_m
alpha_mom_list.append(alpha_m)
beta_mom_list.append(beta_m)
try:
res = optimize.minimize(neg_loglik_gamma_fast, [alpha_m, beta_m], args=(x,),
method='L-BFGS-B', bounds=[(0.01, None), (0.01, None)])
alpha_mle_list.append(res.x[0])
beta_mle_list.append(res.x[1])
except Exception:
alpha_mle_list.append(np.nan)
beta_mle_list.append(np.nan)
a_mom = np.array(alpha_mom_list)
a_mle = np.array(alpha_mle_list)
print(f'Gamma(alpha={alpha_t}, beta={beta_t}), n={n}, M={M_comp} simulations')
print(f'{"Method":<10} {"E[alpha_hat]":>14} {"Var(alpha_hat)":>16} {"MSE":>10}')
print('-' * 54)
for name, est in [("MoM", a_mom), ("MLE", a_mle[~np.isnan(a_mle)])]:
bias = est.mean() - alpha_t
var = est.var()
mse = ((est - alpha_t)**2).mean()
print(f'{name:<10} {est.mean():>14.5f} {var:>16.7f} {mse:>10.7f}')
eff = a_mom.var() / a_mle[~np.isnan(a_mle)].var()
print(f'\nRelative efficiency Var(alpha_MoM) / Var(alpha_MLE) = {eff:.3f}')
print(f'MLE is {(eff-1)*100:.1f}% more efficient than MoM for alpha')
Code cell 49
# === 16.3 Bootstrap CI for Complex Statistics (AUC) ===
import numpy as np
from scipy import stats
np.random.seed(42)
# Simulate a binary classifier's outputs
n_pos, n_neg = 80, 120
scores_pos = np.random.normal(0.7, 0.2, n_pos) # scores for positive class
scores_neg = np.random.normal(0.4, 0.25, n_neg) # scores for negative class
y_true = np.concatenate([np.ones(n_pos), np.zeros(n_neg)])
y_scores = np.concatenate([scores_pos, scores_neg])
def compute_auc(y_true, y_scores):
"""Mann-Whitney U statistic = AUC."""
pos_scores = y_scores[y_true == 1]
neg_scores = y_scores[y_true == 0]
n_p, n_n = len(pos_scores), len(neg_scores)
wins = sum(1 for p in pos_scores for n in neg_scores if p > n)
ties = sum(0.5 for p in pos_scores for n in neg_scores if p == n)
return (wins + ties) / (n_p * n_n)
# Use scipy for speed
from scipy.stats import mannwhitneyu
U, _ = mannwhitneyu(y_scores[y_true==1], y_scores[y_true==0], alternative='greater')
auc_obs = U / (n_pos * n_neg)
# Bootstrap CI
B = 2000
n_total = len(y_true)
boot_aucs = []
for _ in range(B):
idx = np.random.randint(0, n_total, n_total)
y_b, s_b = y_true[idx], y_scores[idx]
if y_b.sum() > 0 and y_b.sum() < n_total:
U_b, _ = mannwhitneyu(s_b[y_b==1], s_b[y_b==0], alternative='greater')
boot_aucs.append(U_b / (y_b.sum() * (n_total - y_b.sum())))
boot_aucs = np.array(boot_aucs)
ci_lo, ci_hi = np.percentile(boot_aucs, [2.5, 97.5])
print(f'Binary classifier AUC estimation:')
print(f' n_pos={n_pos}, n_neg={n_neg}')
print(f' Observed AUC = {auc_obs:.4f}')
print(f' Bootstrap SE = {boot_aucs.std():.4f}')
print(f' 95% bootstrap CI = [{ci_lo:.4f}, {ci_hi:.4f}]')
print(f' CI width = {ci_hi-ci_lo:.4f}')
print('\nBootstrap CI for AUC requires no distributional assumption -- ideal for ML evaluation')
End of Theory Notebook
You have now worked through all the key results of estimation theory interactively.
Proceed to: exercises.ipynb to test your understanding with 8 graded problems from basic MLEs through FIM for neural network layers.
Reference: notes.md for the full theoretical treatment.
Next section: 03-Hypothesis-Testing
Code cell 51
# === Summary: Key Numerical Verification ===
import numpy as np
np.random.seed(42)
print('=' * 60)
print('ESTIMATION THEORY — KEY RESULTS VERIFIED')
print('=' * 60)
n = 1000
p_v = 0.35
x_v = np.random.binomial(1, p_v, n)
p_hat_v = x_v.mean()
# 1. MLE correct
print(f'1. Bernoulli MLE: p_hat={p_hat_v:.4f}, true={p_v} -> err={abs(p_hat_v-p_v):.4f}')
# 2. CRB achieved
crb_v = p_v*(1-p_v)/n
from scipy import stats as st
M_v = 5000
emp_var_v = np.random.binomial(1, p_v, size=(M_v, n)).mean(axis=1).var()
print(f'2. CRB={crb_v:.6f}, empirical Var={emp_var_v:.6f} -> ratio={emp_var_v/crb_v:.4f} (should≈1)')
# 3. Asymptotic normality
p_hats_v = np.random.binomial(1, p_v, size=(M_v, n)).mean(axis=1)
z_v = (p_hats_v - p_v) / np.sqrt(p_v*(1-p_v)/n)
ks_v = st.kstest(z_v, 'norm').pvalue
print(f'3. KS test for normality of standardised MLE: p-value={ks_v:.4f} (>0.05 = normal)')
# 4. 95% CI coverage
z95 = st.norm.ppf(0.975)
se_v = np.sqrt(p_hats_v*(1-p_hats_v)/n)
cov_v = np.mean((p_hats_v - z95*se_v <= p_v) & (p_v <= p_hats_v + z95*se_v))
print(f'4. Wald CI coverage: {cov_v:.4f} (target 0.95)')
print('=' * 60)
print('All key results verified numerically.')