Exercises Notebook
Converted from exercises.ipynb for web reading.
Mutual Information -- Exercises
Ten exercises covering mutual information, from first-principles tabular calculations to contrastive learning and active learning.
| Difficulty | Exercises | Focus |
|---|---|---|
| ★ | 1-3 | definitions, exact computation, structural identities |
| ★★ | 4-6 | chain rules, channel calculations, estimation |
| ★★★ | 7-8 | InfoNCE and modern ML applications |
Each exercise has three cells:
- Problem -- the task statement
- Your Solution -- a runnable scaffold
- Solution -- a complete reference implementation with checks
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
def header(title):
print('\n' + '=' * len(title))
print(title)
print('=' * len(title))
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} — {name}")
if not ok:
print(' expected:', expected)
print(' got :', got)
return ok
def check_true(name, cond):
print(f"{'PASS' if cond else 'FAIL'} — {name}")
return cond
def entropy(p):
p = np.asarray(p, dtype=float)
p = p[p > 0]
return -np.sum(p * np.log(p))
def mutual_info_from_joint(P):
P = np.asarray(P, dtype=float)
px = P.sum(axis=1, keepdims=True)
py = P.sum(axis=0, keepdims=True)
with np.errstate(divide='ignore', invalid='ignore'):
ratio = np.where(P > 0, P / (px @ py), 1.0)
terms = np.where(P > 0, P * np.log(ratio), 0.0)
return float(np.sum(terms))
def conditional_mi_from_joint_xyz(Pxyz):
Pxyz = np.asarray(Pxyz, dtype=float)
Pz = Pxyz.sum(axis=(0, 1), keepdims=True)
Pxz = Pxyz.sum(axis=1, keepdims=True)
Pyz = Pxyz.sum(axis=0, keepdims=True)
with np.errstate(divide='ignore', invalid='ignore'):
num = Pxyz * Pz
den = Pxz * Pyz
ratio = np.where(Pxyz > 0, num / den, 1.0)
terms = np.where(Pxyz > 0, Pxyz * np.log(ratio), 0.0)
return float(np.sum(terms))
def histogram_mi(x, y, bins=20):
hist, _, _ = np.histogram2d(x, y, bins=bins)
P = hist / hist.sum()
return mutual_info_from_joint(P)
def binary_entropy(p):
p = np.clip(np.asarray(p, dtype=float), 1e-12, 1 - 1e-12)
return -(p * np.log(p) + (1 - p) * np.log(1 - p))
print('Setup complete.')
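As a quick sanity check of these helpers (a minimal usage sketch, not one of the exercises): for a 2x2 joint table in which X and Y always agree, knowing X determines Y, so the mutual information should equal $\log 2$ nats.
P_demo = np.array([[0.5, 0.0],
                   [0.0, 0.5]])  # X and Y always take the same value
check_close('demo: deterministic 2x2 table has MI = log(2)',
            mutual_info_from_joint(P_demo), np.log(2))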
Exercise 1 ★ -- Tabular Mutual Information
For the joint table
$$P_{XY} = \begin{pmatrix} 0.40 & 0.10 \\ 0.10 & 0.40 \end{pmatrix},$$
(a) compute $I(X;Y)$ directly.
(b) verify symmetry by comparing with $I(Y;X)$.
(c) verify the entropy identity $I(X;Y) = H(X) + H(Y) - H(X,Y)$.
Code cell 5
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 6
# Solution
# Exercise 1: Solution
P = np.array([[0.40, 0.10],
[0.10, 0.40]])
mi_xy = mutual_info_from_joint(P)
mi_yx = mutual_info_from_joint(P.T)
hx = entropy(P.sum(axis=1))
hy = entropy(P.sum(axis=0))
hxy = entropy(P.ravel())
header('Exercise 1: Tabular Mutual Information')
print(f'I(X;Y) = {mi_xy:.6f}')
print(f'I(Y;X) = {mi_yx:.6f}')
print(f'H(X)+H(Y)-H(X,Y) = {hx + hy - hxy:.6f}')
check_close('symmetry', mi_xy, mi_yx)
check_close('entropy identity', mi_xy, hx + hy - hxy)
print('\nTakeaway: mutual information can be computed exactly from a joint table and is symmetric.')
Exercise 2 ★ -- Zero Correlation, Positive Mutual Information
Let $X \sim \mathcal{N}(0, 1)$ and $Y = X^2 + 0.1\,\varepsilon$ with independent Gaussian noise $\varepsilon \sim \mathcal{N}(0, 1)$.
(a) estimate the correlation between $X$ and $Y$.
(b) estimate the mutual information $I(X;Y)$ using a simple histogram estimator.
(c) explain why the two quantities tell different stories.
Code cell 8
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 9
# Solution
# Exercise 2: Solution
np.random.seed(42)
N = 5000
X = np.random.randn(N)
Y = X**2 + 0.1 * np.random.randn(N)
corr = np.corrcoef(X, Y)[0, 1]
mi_est = histogram_mi(X, Y, bins=30)
header('Exercise 2: Zero Correlation, Positive MI')
print(f'Correlation estimate: {corr:.6f}')
print(f'Histogram MI estimate: {mi_est:.6f}')
check_true('correlation is small in magnitude', abs(corr) < 0.1)
check_true('mutual information is positive', mi_est > 0.2)
print('\nTakeaway: correlation can miss nonlinear dependence, but mutual information still detects it.')
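Why part (c) comes out this way (a short derivation, assuming $X \sim \mathcal{N}(0,1)$ and noise $\varepsilon$ independent of $X$):
$$\operatorname{Cov}(X, Y) = \operatorname{Cov}(X, X^2) + 0.1\operatorname{Cov}(X, \varepsilon) = \mathbb{E}[X^3] - \mathbb{E}[X]\,\mathbb{E}[X^2] + 0 = 0,$$
because the odd moments of a centered Gaussian vanish. Yet $Y$ is nearly a deterministic function of $X$, so $I(X;Y)$ is far from zero.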
Exercise 3 ★ -- Mutual Information as KL Divergence
For the same table as Exercise 1, compute
$$D_{\mathrm{KL}}\!\left(P_{XY} \,\|\, P_X P_Y\right)$$
and verify that it equals the mutual information $I(X;Y)$.
Also check that if the joint table factorizes exactly, the mutual information is zero.
Code cell 11
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 12
# Solution
# Exercise 3: Solution
P = np.array([[0.40, 0.10],
[0.10, 0.40]])
product = np.outer(P.sum(axis=1), P.sum(axis=0))
kl_val = np.sum(np.where(P > 0, P * np.log(P / product), 0.0))
mi_val = mutual_info_from_joint(P)
P_ind = np.outer(np.array([0.5, 0.5]), np.array([0.6, 0.4]))
mi_ind = mutual_info_from_joint(P_ind)
header('Exercise 3: MI as KL Divergence')
print(f'KL(P_xy || P_x P_y) = {kl_val:.6f}')
print(f'I(X;Y) = {mi_val:.6f}')
print(f'I(X;Y) for independent table = {mi_ind:.6f}')
check_close('MI equals KL to independence', mi_val, kl_val)
check_close('independent table has zero MI', mi_ind, 0.0)
print('\nTakeaway: mutual information is exactly the KL divergence from the true joint law to the independence model.')
Exercise 4 ★★ -- Chain Rule and Conditional Mutual Information
Construct a three-variable discrete system and verify the chain rule
$$I(X, Z; Y) = I(X; Y) + I(Z; Y \mid X).$$
Use the synthetic model from the theory notebook: $X \sim \mathrm{Bernoulli}(0.5)$ and $Z \sim \mathrm{Bernoulli}(0.5)$ independently, and $Y = (X + Z + N) \bmod 2$ with noise $N \sim \mathrm{Bernoulli}(0.2)$.
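For context, the identity above is the two-variable case of the general chain rule for mutual information:
$$I(X_1, \dots, X_n; Y) = \sum_{i=1}^{n} I(X_i; Y \mid X_1, \dots, X_{i-1}).$$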
Code cell 14
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 15
# Solution
# Exercise 4: Solution
np.random.seed(11)
N = 200000
X = np.random.binomial(1, 0.5, size=N)
Z = np.random.binomial(1, 0.5, size=N)
Y = (X + Z + np.random.binomial(1, 0.2, size=N)) % 2
Pxyz = np.zeros((2, 2, 2), dtype=float)
for x, z, y in zip(X, Z, Y):
Pxyz[x, z, y] += 1
Pxyz /= N
lhs = mutual_info_from_joint(Pxyz.reshape(4, 2))
rhs = mutual_info_from_joint(Pxyz.sum(axis=1)) + conditional_mi_from_joint_xyz(np.transpose(Pxyz, (1, 2, 0)))
header('Exercise 4: Chain Rule')
print(f'I((X,Z);Y) = {lhs:.6f}')
print(f'I(X;Y)+I(Z;Y|X) = {rhs:.6f}')
check_close('chain rule', lhs, rhs, tol=2e-3)
print('\nTakeaway: chain rules decompose information into what is already explained and what is added conditionally.')
Exercise 5 ★★ -- Binary Symmetric Channel and Capacity
For a binary symmetric channel with flip probability $\varepsilon$ and a uniform input,
$$I(X;Y) = \log 2 - H_b(\varepsilon)$$
in nats, where $H_b$ is the binary entropy function.
(a) compute the mutual information at $\varepsilon \in \{0.10, 0.25, 0.49\}$.
(b) verify that the mutual information decreases as noise increases.
Code cell 17
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 18
# Solution
# Exercise 5: Solution
epsilons = np.array([0.10, 0.25, 0.49])
mi_vals = np.log(2) - binary_entropy(epsilons)
header('Exercise 5: Binary Symmetric Channel')
for e, m in zip(epsilons, mi_vals):
print(f'epsilon={e:.2f} -> I(X;Y)={m:.6f} nats')
check_true('mutual information decreases with noise', np.all(np.diff(mi_vals) < 0))
print('\nTakeaway: channel noise destroys information, and capacity falls accordingly.')
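The closed form used here follows in one line (a sketch for the uniform-input case): a uniform input through a symmetric channel produces a uniform output, and given $X$ the output is a biased coin flip, so
$$I(X;Y) = H(Y) - H(Y \mid X) = \log 2 - H_b(\varepsilon).$$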
Exercise 6 ★★ -- Estimator Bias in Practice
Estimate mutual information for a correlated Gaussian pair with a simple histogram estimator. Use $\rho = 0.6$ and compare the estimate with the exact Gaussian formula
$$I(X;Y) = -\tfrac{1}{2}\log\!\left(1 - \rho^2\right).$$
Repeat for two different bin counts and interpret the difference.
Code cell 20
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 21
# Solution
# Exercise 6: Solution
np.random.seed(5)
rho = 0.6
cov = np.array([[1.0, rho], [rho, 1.0]])
XY = np.random.multivariate_normal([0.0, 0.0], cov, size=4000)
true_mi = -0.5 * np.log(1 - rho**2)
est_12 = histogram_mi(XY[:, 0], XY[:, 1], bins=12)
est_30 = histogram_mi(XY[:, 0], XY[:, 1], bins=30)
header('Exercise 6: Estimator Bias')
print(f'True Gaussian MI = {true_mi:.6f}')
print(f'Histogram estimate (12 bins) = {est_12:.6f}')
print(f'Histogram estimate (30 bins) = {est_30:.6f}')
check_true('estimates are positive', est_12 > 0 and est_30 > 0)
check_true('different bin counts give different estimates', abs(est_12 - est_30) > 1e-3)
print('\nTakeaway: mutual-information estimation is sensitive to estimator design even on simple continuous problems.')
Exercise 7 ★★★ -- InfoNCE as a Lower-Bound-Style Objective
Use correlated Gaussian views with correlation $\rho = 0.8$. Compute an InfoNCE-style lower bound with $K = 8$ and $K = 64$ candidate scores (one positive plus $K - 1$ negatives) and compare with the exact Gaussian MI.
You do not need to prove the bound here; focus on the computation and interpretation.
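For reference (stated without proof, as the problem allows): writing $\mathcal{L}_{\text{InfoNCE}}$ for the expected contrastive loss with $K$ candidate scores, the standard result is
$$\log K - \mathcal{L}_{\text{InfoNCE}} \;\le\; I(X;Y),$$
so the estimate below is capped at $\log K$ no matter how large the true mutual information is.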
Code cell 23
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 24
# Solution
# Exercise 7: Solution
np.random.seed(123)
N = 3000
rho = 0.8
z1 = np.random.randn(N, 1)
z2 = rho * z1 + np.sqrt(1 - rho**2) * np.random.randn(N, 1)
def infonce_bound(K, temperature=0.2):
idx = np.random.choice(N, size=400, replace=False)
x = z1[idx]
y_pos = z2[idx]
pos = (x * y_pos)[:, 0] / temperature
losses = []
for i, score in enumerate(pos):
neg_idx = np.random.choice(N, size=K - 1, replace=False)
neg = (x[i] * z2[neg_idx]).reshape(-1) / temperature
all_scores = np.concatenate([[score], neg])
losses.append(-score + np.log(np.sum(np.exp(all_scores))))
return np.log(K) - np.mean(losses)
bound_8 = infonce_bound(8)
bound_64 = infonce_bound(64)
true_mi = -0.5 * np.log(1 - rho**2)
header('Exercise 7: InfoNCE Lower Bound')
print(f'Bound with K=8: {bound_8:.6f}')
print(f'Bound with K=64: {bound_64:.6f}')
print(f'True Gaussian MI: {true_mi:.6f}')
check_true('bounds are below the true MI up to small estimator noise', bound_8 <= true_mi + 0.5 and bound_64 <= true_mi + 0.5)
check_true('more negatives usually tighten the bound', bound_64 >= bound_8 - 1e-6)
print('\nTakeaway: InfoNCE behaves like a practical lower-bound objective, not the mutual information itself.')
Exercise 8 ★★★ -- Active Learning via Mutual Information
Suppose the model state $\Theta \in \{\theta_0, \theta_1\}$ is equally likely a priori. For a candidate input $x$, the unknown label $Y$ is Bernoulli with parameter
$$p(Y = 1 \mid x, \theta_0) = p_0, \qquad p(Y = 1 \mid x, \theta_1) = p_1.$$
Compute the BALD-style score $I(Y; \Theta \mid x) = H(Y \mid x) - \mathbb{E}_{\Theta}\big[H(Y \mid x, \Theta)\big]$ for three candidates and choose the best one.
Candidates:
- uninformative: $(p_0, p_1) = (0.50, 0.50)$
- moderate: $(p_0, p_1) = (0.75, 0.25)$
- very informative: $(p_0, p_1) = (0.95, 0.05)$
Code cell 26
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 27
# Solution
# Exercise 8: Solution
candidates = {
'uninformative': (0.50, 0.50),
'moderate': (0.75, 0.25),
'very informative': (0.95, 0.05),
}
def bald_score(p0, p1):
p_theta = np.array([0.5, 0.5])
p_y1 = np.dot(p_theta, [p0, p1])
H_y = binary_entropy(p_y1)
H_y_given_theta = 0.5 * binary_entropy(p0) + 0.5 * binary_entropy(p1)
return H_y - H_y_given_theta
scores = {name: bald_score(*vals) for name, vals in candidates.items()}
best = max(scores, key=scores.get)
header('Exercise 8: Active Learning via Mutual Information')
for name, score in scores.items():
print(f"{name:>16}: {score:.6f} nats")
print(f'Best query: {best}')
check_true('the most separated candidate is most informative', best == 'very informative')
check_true('uninformative query has zero information gain', abs(scores['uninformative']) < 1e-10)
print('\nTakeaway: active learning prefers queries whose labels would most reduce uncertainty about the model state.')
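As a hand check on the winning score (values rounded): for the very informative candidate the marginal label probability is $\tfrac{1}{2}(0.95 + 0.05) = 0.5$, so
$$I(Y;\Theta) = H_b(0.5) - \tfrac{1}{2}\big(H_b(0.95) + H_b(0.05)\big) = \log 2 - H_b(0.05) \approx 0.6931 - 0.1985 \approx 0.495 \text{ nats}.$$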
Exercise 9: Binary Symmetric Channel Capacity
Compute $I(X;Y)$ for a binary symmetric channel with flip probability $\varepsilon = 0.15$ over a range of input biases and verify that uniform inputs maximize the mutual information.
Code cell 29
# Your Solution
print("Evaluate mutual information for several input biases.")
Code cell 30
# Solution
header("Exercise 9: Binary Symmetric Channel")
eps = 0.15
def bsc_joint(px1):
px = np.array([1-px1, px1])
channel = np.array([[1-eps, eps], [eps, 1-eps]])
return px[:, None] * channel
vals = [(a, mutual_info_from_joint(bsc_joint(a))) for a in np.linspace(0.05, 0.95, 19)]
best_a, best_mi = max(vals, key=lambda t: t[1])
print("best input probability:", round(float(best_a), 3))
print("best MI:", round(float(best_mi), 6))
check_close("capacity at uniform input", best_a, 0.5, tol=0.05)
print("Takeaway: symmetric channels are maximized by maximum-entropy inputs.")
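As a cross-check against the closed form from Exercise 5 (a quick hand calculation, values rounded): at $\varepsilon = 0.15$ the capacity is
$$C = \log 2 - H_b(0.15) \approx 0.6931 - 0.4227 \approx 0.270 \text{ nats},$$
which the grid search above attains at the uniform input.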
Exercise 10: InfoNCE Signal With Better Positives
Build a tiny similarity matrix and show that increasing positive-pair scores improves the InfoNCE lower-bound surrogate.
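Concretely, the loss to use (and the one implemented in the reference solution below) treats each row $i$ of the similarity matrix $S$ as a softmax classification over the columns, with the diagonal entry as the positive:
$$\mathcal{L} = -\frac{1}{n}\sum_{i=1}^{n} \log \frac{\exp(S_{ii})}{\sum_{j} \exp(S_{ij})}.$$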
Code cell 32
# Your Solution
print("Compare InfoNCE losses for weak and strong positive-pair scores.")
Code cell 33
# Solution
header("Exercise 10: InfoNCE Signal")
def info_nce_loss(S):
S = S - S.max(axis=1, keepdims=True)
log_probs = S - np.log(np.sum(np.exp(S), axis=1, keepdims=True))
return float(-np.mean(np.diag(log_probs)))
weak = np.array([[1.0, 0.8, 0.2], [0.7, 1.0, 0.5], [0.1, 0.4, 1.0]])
strong = weak + np.eye(3) * 1.5
loss_weak = info_nce_loss(weak)
loss_strong = info_nce_loss(strong)
print("weak loss:", round(loss_weak, 6))
print("strong loss:", round(loss_strong, 6))
check_true("strong positives reduce loss", loss_strong < loss_weak)
print("Takeaway: contrastive learning rewards representations that identify matched pairs among negatives.")