Exercises Notebook
Converted from exercises.ipynb for web reading.
Principal Component Analysis - Exercises
This notebook contains 10 progressive exercises for 03-Principal-Component-Analysis. Each exercise has a learner workspace followed by a complete reference solution. Consult the solution cells only after making a serious attempt.
Difficulty grows from direct computation to AI-facing interpretation. Formulas use LaTeX-in-Markdown with $...$ and $$...$$.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
import scipy.linalg as sla
from scipy import stats
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        print(" got =", got)
        print(" expected=", expected)
    return ok

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def gram_schmidt_columns(A, tol=1e-12):
    A = np.asarray(A, dtype=float)
    Q = []
    for j in range(A.shape[1]):
        v = A[:, j].copy()
        for q in Q:
            v -= (q @ v) * q
        n = la.norm(v)
        if n > tol:
            Q.append(v / n)
    return np.column_stack(Q) if Q else np.empty((A.shape[0], 0))

def projection_matrix(A):
    Q = gram_schmidt_columns(A)
    return Q @ Q.T

def numerical_rank(A, tol=1e-10):
    return int(np.sum(la.svd(np.asarray(A, dtype=float), compute_uv=False) > tol))

def stable_rank(A):
    s = la.svd(np.asarray(A, dtype=float), compute_uv=False)
    return float(np.sum(s**2) / (s[0]**2 + 1e-15))

def make_spd(n, seed=0, ridge=0.5):
    rng = np.random.default_rng(seed)
    A = rng.normal(size=(n, n))
    return A.T @ A + ridge * np.eye(n)
print("Chapter 03 helper setup complete.")
Exercise 1: Center Data and Covariance
Compute centered data and the sample covariance matrix $C = \frac{1}{n-1} X_c^\top X_c$.
Code cell 5
# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")
Code cell 6
# Solution
# Exercise 1 - Center Data and Covariance
header("Exercise 1: covariance")
X = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 4.0], [4.0, 3.0]])
Xc = X - X.mean(axis=0)
C = Xc.T @ Xc / (X.shape[0] - 1)
print("mean:", X.mean(axis=0))
print("covariance:\n", C)
check_close("centered mean zero", Xc.mean(axis=0), np.zeros(2))
check_close("cov symmetric", C, C.T)
Exercise 2: PCA by Eigendecomposition
Find principal components as eigenvectors of the covariance matrix.
Code cell 8
# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")
Code cell 9
# Solution
# Exercise 2 - PCA by Eigendecomposition
header("Exercise 2: eig PCA")
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2)) @ np.array([[3.0, 0.0], [1.0, 0.5]])
Xc = X - X.mean(axis=0)
C = Xc.T @ Xc / 99
vals, vecs = la.eigh(C)
order = np.argsort(vals)[::-1]
vals, vecs = vals[order], vecs[:, order]
print("explained variances:", vals)
check_true("sorted descending", vals[0] >= vals[1])
check_close("components orthonormal", vecs.T @ vecs, np.eye(2))
Exercise 3: PCA by SVD
Show that right singular vectors of centered data are PCA directions.
Code cell 11
# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")
Code cell 12
# Solution
# Exercise 3 - PCA by SVD
header("Exercise 3: SVD PCA")
rng = np.random.default_rng(1)
X = rng.normal(size=(80, 3)) @ np.diag([4.0, 1.0, 0.2])
Xc = X - X.mean(axis=0)
U, s, Vt = la.svd(Xc, full_matrices=False)
vars_svd = s**2 / (X.shape[0] - 1)
vars_cov = np.sort(la.eigvalsh(Xc.T @ Xc / 79))[::-1]
print("SVD variances:", vars_svd)
check_close("SVD variances match covariance eigenvalues", vars_svd, vars_cov)
Exercise 4: Reconstruction Error
Project onto the top $k$ components and compare the squared reconstruction error to the sum of the discarded squared singular values.
Code cell 14
# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")
Code cell 15
# Solution
# Exercise 4 - Reconstruction Error
header("Exercise 4: reconstruction error")
rng = np.random.default_rng(2)
X = rng.normal(size=(60, 4)) @ np.diag([5.0, 2.0, 0.5, 0.1])
Xc = X - X.mean(axis=0)
U, s, Vt = la.svd(Xc, full_matrices=False)
k = 2
Xhat = U[:, :k] @ np.diag(s[:k]) @ Vt[:k]
err = la.norm(Xc - Xhat, 'fro')**2
print("squared reconstruction error:", err)
check_close("discarded singular energy", err, np.sum(s[k:]**2))
Exercise 5: PCA Whitening
Transform data so the empirical covariance is approximately the identity.
Code cell 17
# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")
Code cell 18
# Solution
# Exercise 5 - PCA Whitening
header("Exercise 5: whitening")
rng = np.random.default_rng(3)
X = rng.normal(size=(200, 2)) @ np.array([[2.0, 0.7], [0.0, 0.4]])
Xc = X - X.mean(axis=0)
C = Xc.T @ Xc / 199
vals, Q = la.eigh(C)
W = Q @ np.diag(1 / np.sqrt(vals)) @ Q.T
Z = Xc @ W
Cz = Z.T @ Z / 199
print("whitened covariance:\n", Cz)
check_close("covariance near identity", Cz, np.eye(2), tol=1e-10)
Exercise 6: Choose $k$ by Explained Variance
Select the smallest $k$ with cumulative explained variance of at least $0.95$.
Code cell 20
# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")
Code cell 21
# Solution
# Exercise 6 - Choose $k$ by Explained Variance
header("Exercise 6: choose k")
s = np.array([10.0, 4.0, 2.0, 1.0, 0.5])
var = s**2
cum = np.cumsum(var) / var.sum()
k = int(np.searchsorted(cum, 0.95) + 1)
print("cumulative explained variance:", cum)
print("selected k:", k)
check_true("threshold reached", cum[k-1] >= 0.95)
check_true("minimal", k == 2)
Exercise 7: Kernel PCA Centering
Center an RBF kernel matrix and verify the centered Gram matrix has zero row sums.
Code cell 23
# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")
Code cell 24
# Solution
# Exercise 7 - Kernel PCA Centering
header("Exercise 7: kernel centering")
X = np.linspace(-2, 2, 8)[:, None]
D2 = (X - X.T)**2
K = np.exp(-D2 / 2)
n = K.shape[0]
H = np.eye(n) - np.ones((n, n))/n
Kc = H @ K @ H
vals = la.eigvalsh(Kc)
print("row sums:", Kc.sum(axis=1))
print("min eigenvalue:", vals[0])
check_close("centered rows sum to zero", Kc.sum(axis=1), np.zeros(n), tol=1e-12)
check_true("PSD up to numerical error", vals[0] > -1e-10)
Exercise 8: Randomized PCA
Approximate the top principal subspace using a random range finder.
Code cell 26
# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")
Code cell 27
# Solution
# Exercise 8 - Randomized PCA
header("Exercise 8: randomized PCA")
rng = np.random.default_rng(4)
X = rng.normal(size=(120, 6)) @ np.diag([5, 3, 1, .5, .2, .1])
Xc = X - X.mean(axis=0)
Omega = rng.normal(size=(6, 3))
Q, _ = la.qr(Xc @ Omega, mode='reduced')
B = Q.T @ Xc
s_sketch = la.svd(B, compute_uv=False)
s_full = la.svd(Xc, compute_uv=False)
print("full top singular value", s_full[0], "sketch", s_sketch[0])
check_true("top component captured", abs(s_full[0]-s_sketch[0])/s_full[0] < 0.05)
Exercise 9: LoRA Rank from Spectrum
Use a singular spectrum to estimate the rank needed for a low-rank update.
Code cell 29
# Your Solution
# Exercise 9 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 9.")
Code cell 30
# Solution
# Exercise 9 - LoRA Rank from Spectrum
header("Exercise 9: LoRA rank selection")
s = np.array([9.0, 5.0, 2.0, 0.8, 0.2, 0.1])
energy = np.cumsum(s**2) / np.sum(s**2)
r = int(np.searchsorted(energy, 0.98) + 1)
print("energy curve:", energy)
print("rank for 98 percent energy:", r)
check_true("small low-rank adapter enough", r < len(s))
Exercise 10: Intrinsic Dimension
Compute the participation-ratio dimension $d_{\mathrm{eff}} = \frac{(\sum_i \lambda_i)^2}{\sum_i \lambda_i^2}$.
Code cell 32
# Your Solution
# Exercise 10 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 10.")
Code cell 33
# Solution
# Exercise 10 - Intrinsic Dimension
header("Exercise 10: intrinsic dimension")
lams = np.array([8.0, 3.0, 1.0, 0.5, 0.2])
d_eff = (lams.sum()**2) / np.sum(lams**2)
print("effective dimension:", d_eff)
check_true("between 1 and ambient dimension", 1 <= d_eff <= len(lams))
check_true("low-dimensional spectrum", d_eff < len(lams))