
Fisher Information — Exercises

This notebook contains 10 exercises covering score functions, scalar and matrix Fisher information, KL curvature, Jeffreys priors, empirical Fisher approximations, natural gradient, and ML-facing applications.

Difficulty: Exercises 1-3 are mechanics, 4-6 are theory, 7-8 are advanced ML applications, and 9-10 revisit the core results from an applied angle.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        print("  value :", value)
        print("  target:", target)
    return ok

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

print("Exercise helpers ready.")

Exercise 1 [*] — Bernoulli Fisher

Let $X \sim \operatorname{Bern}(p)$.

  1. Derive the score $s_p(X)$.
  2. Compute the Fisher information $I(p)$.
  3. Explain where on $(0,1)$ the model is most locally informative.

Code cell 5

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 6

# Solution
# Exercise 1: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 1: Bernoulli Fisher")
p = 0.3
# Score: s_p(x) = x/p - (1-x)/(1-p), so s_p(1) = 1/p and s_p(0) = -1/(1-p).
score1 = 1.0 / p
score0 = -1.0 / (1 - p)
# Fisher from the definition I(p) = E[s_p(X)^2], checked against the closed form.
fisher = p * score1**2 + (1 - p) * score0**2
check_close("I(p) = 1/(p(1-p))", fisher, 1.0 / (p * (1 - p)))
print("The model is most locally informative near the boundaries p -> 0 or p -> 1.")
print("\nTakeaway: Bernoulli Fisher grows near the boundaries because rare outcomes are highly informative about edge probabilities.")

Exercise 2 [*] — Additivity

Show that if $X_1,\dots,X_n$ are iid from a regular scalar model, then $I_n(\theta) = n\,I(\theta)$. Verify the formula numerically for the Gaussian mean model with known variance.

Code cell 8

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 9

# Solution
# Exercise 2: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 2: Additivity")
n = 20
sigma2 = 2.0
single = 1.0 / sigma2          # I(mu) = 1/sigma^2 for one Gaussian observation
total = n * single
check_close("I_n = n I", total, 10.0)
# Numerical verification: the score of a size-n iid sample (at mu = 0) is
# sum_i x_i / sigma^2; its variance is the sample Fisher information I_n(mu).
rng = np.random.default_rng(0)
samples = rng.normal(0.0, np.sqrt(sigma2), size=(200_000, n))
scores = samples.sum(axis=1) / sigma2
check_close("Monte Carlo I_n matches n I", scores.var(), total, tol=5e-2)
print("\nTakeaway: Fisher information accumulates linearly across independent observations.")

Exercise 3 [*] — Reparameterization

For the Bernoulli model, compute Fisher information in probability coordinates $p$ and in logit coordinates $\phi = \log\frac{p}{1-p}$. Verify the transformation law.

Code cell 11

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 12

# Solution
# Exercise 3: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 3: Reparameterization")
p = 0.25
fisher_p = 1.0 / (p * (1 - p))
# p = sigmoid(phi), so dp/dphi = p(1 - p).
dp_dphi = p * (1 - p)
# Scalar transformation law: I_phi = I_p (dp/dphi)^2.
fisher_phi = fisher_p * dp_dphi**2
check_close("I_phi = I_p (dp/dphi)^2", fisher_phi, p * (1 - p))
print("\nTakeaway: Raw Fisher values change under coordinates, but the metric meaning is preserved.")

Exercise 4 [**] — Gaussian matrix Fisher

Let $X \sim \mathcal{N}(\boldsymbol{\mu}, \Sigma)$ with known covariance matrix $\Sigma$. Derive the Fisher information matrix for $\boldsymbol{\mu}$ and explain what its eigenvalues mean.

Code cell 14

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 15

# Solution
# Exercise 4: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 4: Gaussian matrix Fisher")
Sigma = np.array([[2.0, 0.5], [0.5, 1.0]])
fisher = np.linalg.inv(Sigma)
# Monte Carlo check of I(mu) = E[s s^T] with score s = Sigma^{-1}(x - mu).
rng = np.random.default_rng(0)
xs = rng.multivariate_normal(np.zeros(2), Sigma, size=200_000)
scores = xs @ fisher          # mu = 0 and Sigma^{-1} is symmetric, so s = x Sigma^{-1}
mc_fisher = scores.T @ scores / len(xs)
check_close("Monte Carlo Fisher matches Sigma^{-1}", mc_fisher, fisher, tol=2e-2)
print("Eigenvalues describe directional information strength in parameter space.")
print("\nTakeaway: In Gaussian location models, covariance and Fisher information are exact inverses.")

Exercise 5 [**] — Local KL curvature

Show for the Bernoulli model that $D_{\mathrm{KL}}(p_\theta \,\|\, p_{\theta+\delta}) = \tfrac{1}{2} I(\theta)\,\delta^2 + o(\delta^2)$. Verify this numerically for a small displacement.

Code cell 17

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 18

# Solution
# Exercise 5: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 5: Local KL curvature")
p = 0.4
delta = 0.01
q = p + delta
# Exact Bernoulli KL between parameter p and its perturbation q = p + delta.
kl = p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
fisher = 1.0 / (p * (1 - p))
quad = 0.5 * fisher * delta**2
# The gap is the O(delta^3) Taylor remainder, hence the loose tolerance.
check_close("Local KL matches the Fisher quadratic term", kl, quad, tol=5e-6)
print("\nTakeaway: Fisher information is the second-order curvature hidden inside local KL divergence.")

Exercise 6 [**] — Jeffreys prior

Derive Jeffreys prior for Bernoulli($p$) and for the exponential rate parameter $\lambda$. State which prior is proper and which is improper.

Code cell 20

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 21

# Solution
# Exercise 6: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

header("Exercise 6: Jeffreys prior")
print("Bernoulli: pi_J(p) proportional to 1/sqrt(p(1-p)) -> proper Beta(1/2, 1/2).")
print("Exponential rate: pi_J(lambda) proportional to 1/lambda -> improper on (0, infinity).")
check_true("Bernoulli Jeffreys prior is proper", True)
check_true("Exponential Jeffreys prior is improper", True)
print("\nTakeaway: Jeffreys priors are invariant by construction, but not always normalizable.")

Exercise 7 [***] — Empirical Fisher versus Hessian

Build a toy logistic-regression example and compare the empirical Fisher with the observed Hessian. Explain why they need not match on finite data.

Code cell 23

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 24

# Solution
# Exercise 7: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

header("Exercise 7: Empirical Fisher versus Hessian")
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
w = np.array([0.8, -0.4])
probs = sigmoid(X @ w)
y = rng.binomial(1, probs)
W = probs * (1 - probs)
# Observed Hessian of the average negative log-likelihood: X^T diag(W) X / n.
# With the canonical (logit) link it does not depend on y, so it also equals
# the expected Fisher information.
hessian = (X.T * W) @ X / len(X)
# Empirical Fisher: average outer product of per-example gradients at w,
# using the observed labels y in place of the model expectation.
grads = ((y - probs)[:, None]) * X
empirical = grads.T @ grads / len(X)
diff = np.linalg.norm(empirical - hessian)
print("||Empirical Fisher - Hessian||_F =", diff)
check_true("The difference is nonzero on finite data", diff > 1e-4)
print("\nTakeaway: The empirical Fisher is a convenient approximation, not the same object by definition.")

Exercise 8 [***] — EWC-style Fisher importance

Use a diagonal Fisher estimate from a first binary-classification task to construct an EWC penalty. Show that larger moves along important parameters incur larger penalties.

Code cell 26

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 27

# Solution
# Exercise 8: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

header("Exercise 8: EWC-style Fisher importance")
rng = np.random.default_rng(1)
X = rng.normal(size=(300, 2))
w_star = np.array([0.9, -0.7])
probs = sigmoid(X @ w_star)
y = rng.binomial(1, probs)
# Diagonal Fisher estimate: per-parameter mean of squared per-example gradients.
diag_fisher = np.mean((((y - probs)[:, None]) * X) ** 2, axis=0)
old = w_star
big_move = np.array([1.4, -0.1])
small_move = np.array([0.95, -0.65])
# EWC-style penalty: 0.5 * sum_j F_jj (w_j - w*_j)^2.
penalty_big = 0.5 * np.sum(diag_fisher * (big_move - old) ** 2)
penalty_small = 0.5 * np.sum(diag_fisher * (small_move - old) ** 2)
print("big-move penalty  =", penalty_big)
print("small-move penalty=", penalty_small)
check_true("Larger movement creates a larger EWC penalty", penalty_big > penalty_small)
print("\nTakeaway: EWC reuses Fisher information as a local importance weighting on parameters.")

Exercise 9 [*] — Bernoulli Fisher Information

Compute the Fisher information of a Bernoulli model and verify the closed form $I(p) = 1/(p(1-p))$ away from the boundary.

Code cell 29

# Your Solution
print("Compute Bernoulli Fisher information at p=0.3.")

Code cell 30

# Solution
header("Exercise 9: Bernoulli Fisher")
p = 0.3
score1 = 1 / p            # score at X = 1
score0 = -1 / (1 - p)     # score at X = 0
I = p * score1**2 + (1 - p) * score0**2   # I(p) = E[s_p(X)^2]
closed = 1 / (p * (1 - p))
print("computed:", round(float(I), 6))
print("closed form:", round(float(closed), 6))
check_close("Bernoulli Fisher", I, closed)
print("Takeaway: Fisher information blows up near p=0 or p=1 because the model becomes locally sensitive.")

Exercise 10 [**] — Natural Gradient Rescaling

Compare an ordinary gradient step and a natural-gradient step for a two-parameter diagonal Fisher matrix.

Code cell 32

# Your Solution
print("Rescale a gradient by inverse Fisher curvature.")

Code cell 33

# Solution
header("Exercise 10: Natural Gradient")
g = np.array([2.0, 2.0])
F = np.diag([10.0, 0.5])   # diagonal Fisher: coordinate 0 has high curvature
eta = 0.1
ordinary = -eta * g
natural = -eta * np.linalg.solve(F, g)   # natural step: -eta * F^{-1} g
print("ordinary step:", ordinary)
print("natural step:", natural)
check_true("high-curvature coordinate is damped", abs(natural[0]) < abs(ordinary[0]))
check_true("low-curvature coordinate is amplified", abs(natural[1]) > abs(ordinary[1]))
print("Takeaway: natural gradient measures step size in distribution space, not raw parameter space.")