Theory Notebook
Converted from theory.ipynb for web reading.
Principal Component Analysis — Theory Notebook
"Give me the directions of maximum variance and I will show you the geometry of the data."
Interactive demonstrations covering: PCA geometry, variance maximization, SVD connection, whitening, probabilistic PCA, kernel PCA, and AI applications including intrinsic dimensionality and LoRA analysis.
Run cells top-to-bottom. All random seeds are fixed for reproducibility.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats
try:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Ellipse
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 13
HAS_MPL = True
except ImportError:
HAS_MPL = False
print('matplotlib not available — skipping visualizations')
try:
import seaborn as sns
HAS_SNS = True
except ImportError:
HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. NumPy:', np.__version__)
1. Intuition — The Geometry of PCA
PCA finds the axes of greatest variance in data. We visualize this for 2D data: the covariance ellipse and its principal axes.
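For reference, the geometric fact the next cell illustrates: the eigenvectors of the covariance matrix $C$ are the axes of the covariance ellipse, and the semi-axis lengths are proportional to the square roots of the eigenvalues:

$$ C v_i = \lambda_i v_i, \qquad \text{semi-axis along } v_i \;\propto\; \sqrt{\lambda_i}. $$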
Code cell 5
# === 1.1 PCA Geometry in 2D ===
# Generate correlated 2D Gaussian data
np.random.seed(42)
n = 300
mean = np.array([2.0, 3.0])
# Covariance: strong correlation between x1 and x2
cov = np.array([[3.0, 2.0],
[2.0, 2.0]])
X = np.random.multivariate_normal(mean, cov, n)
# PCA via SVD
X_c = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_c, full_matrices=False)
lambdas = s**2 / (n - 1) # explained variances
print('Sample covariance matrix:')
C = X_c.T @ X_c / (n - 1)
print(np.round(C, 4))
print(f'\nTrue covariance: [[3.0, 2.0], [2.0, 2.0]]')
print(f'\nExplained variances: lambda1={lambdas[0]:.4f}, lambda2={lambdas[1]:.4f}')
print(f'Total variance: {lambdas.sum():.4f} (should ≈ tr(cov) = {np.trace(cov):.1f})')
print(f'\nPC1 direction: {Vt[0]}')
print(f'PC2 direction: {Vt[1]}')
print(f'PC1 · PC2 = {Vt[0] @ Vt[1]:.2e} (should be ~0 — orthogonal)')
# Verify: PC directions are eigenvectors of C
evals, evecs = np.linalg.eigh(C)
evals = evals[::-1]; evecs = evecs[:, ::-1]
ok1 = np.allclose(np.abs(Vt[0]), np.abs(evecs[:, 0]))
print(f'\nPASS: SVD right singular vectors match covariance eigenvectors: {ok1}')
Code cell 6
# === 1.2 Visualize 2D PCA geometry ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Left: original data with PC arrows
ax = axes[0]
ax.scatter(X[:, 0], X[:, 1], alpha=0.3, s=15, c='steelblue')
mu = X.mean(axis=0)
scale = np.sqrt(lambdas)
colors = ['crimson', 'darkorange']
for i, (color, label) in enumerate(zip(colors, ['PC1', 'PC2'])):
ax.annotate('', xy=mu + 2*scale[i]*Vt[i], xytext=mu,
arrowprops=dict(arrowstyle='->', color=color, lw=2.5))
ax.annotate('', xy=mu - 2*scale[i]*Vt[i], xytext=mu,
arrowprops=dict(arrowstyle='->', color=color, lw=2.5))
ax.text(*(mu + 2.2*scale[i]*Vt[i]), label, color=color, fontsize=12, fontweight='bold')
ax.set_title('Original Data with Principal Components')
ax.set_xlabel('$x_1$'); ax.set_ylabel('$x_2$')
ax.set_aspect('equal')
# Right: PCA-rotated data (scores)
ax = axes[1]
Z = U * s # scores
ax.scatter(Z[:, 0], Z[:, 1], alpha=0.3, s=15, c='steelblue')
ax.axhline(0, color='crimson', lw=1.5, ls='--')
ax.axvline(0, color='darkorange', lw=1.5, ls='--')
ax.set_title('PCA Scores (Rotated to PC axes)')
ax.set_xlabel(f'PC1 ({lambdas[0]/lambdas.sum():.1%} var)')
ax.set_ylabel(f'PC2 ({lambdas[1]/lambdas.sum():.1%} var)')
ax.set_aspect('equal')
# Verify decorrelation
C_scores = np.cov(Z.T)
print(f'Score covariance (should be diagonal):\n{np.round(C_scores, 4)}')
plt.tight_layout()
plt.show()
else:
print('Skipping visualization (no matplotlib)')
2. Formal Definitions — Derivation from First Principles
We derive PCA from the variance-maximization principle and verify that the SVD gives the same answer as eigendecomposition of the covariance matrix.
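For reference, the two derivations the cells below check numerically: maximizing the projected variance over unit vectors is a Rayleigh-quotient problem whose Lagrangian stationarity condition is the eigenvalue equation, and the SVD of the centered data diagonalizes the same covariance matrix:

$$ \max_{\|v\|=1} v^\top C v \;\Longrightarrow\; \nabla_v\!\left[v^\top C v - \lambda\,(v^\top v - 1)\right] = 0 \;\Longrightarrow\; C v = \lambda v, $$

$$ X_c = U S V^\top \;\Longrightarrow\; C = \tfrac{1}{n-1} X_c^\top X_c = V\,\tfrac{S^2}{n-1}\,V^\top, \qquad \lambda_i = \frac{s_i^2}{n-1}. $$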
Code cell 8
# === 2.1 Variance Maximization: Optimization Landscape ===
# For 2D data, visualize vᵀCv for all unit vectors v = (cos θ, sin θ)
np.random.seed(42)
n = 200
cov = np.array([[4.0, 2.0], [2.0, 1.5]])
X = np.random.multivariate_normal([0, 0], cov, n)
X_c = X - X.mean(axis=0)
C = X_c.T @ X_c / (n - 1)
theta_vals = np.linspace(0, np.pi, 500)
variances = []
for theta in theta_vals:
v = np.array([np.cos(theta), np.sin(theta)])
variances.append(v @ C @ v)
variances = np.array(variances)
# Find maximum via PCA
_, s, Vt = np.linalg.svd(X_c, full_matrices=False)
pc1_angle = np.arctan2(Vt[0, 1], Vt[0, 0]) % np.pi
max_var = s[0]**2 / (n - 1)
print(f'Maximum projected variance: {variances.max():.4f}')
print(f'Achieved at theta = {theta_vals[variances.argmax()]:.4f} rad')
print(f'PC1 angle = {pc1_angle:.4f} rad')
print(f'Lambda1 (max eigenvalue) = {max_var:.4f}')
print(f'PASS: {np.isclose(variances.max(), max_var, rtol=0.01)}')
if HAS_MPL:
plt.figure(figsize=(9, 4))
plt.plot(np.degrees(theta_vals), variances, 'steelblue', lw=2)
plt.axvline(np.degrees(pc1_angle), color='crimson', ls='--', lw=2, label=f'PC1 ({max_var:.2f})')
plt.xlabel('Direction angle θ (degrees)')
plt.ylabel('Projected variance vᵀCv')
plt.title('Projected Variance as a Function of Direction')
plt.legend()
plt.tight_layout()
plt.show()
Code cell 9
# === 2.2 SVD vs Eigendecomposition — Numerical Comparison ===
np.random.seed(42)
n, d = 100, 20
X = np.random.randn(n, d) @ np.diag(np.arange(d, 0, -1)**0.5) # structured variance
# Method 1: Covariance eigendecomposition (eigh — symmetric solver)
X_c = X - X.mean(axis=0)
C = X_c.T @ X_c / (n - 1)
lambdas_eig, V_eig = np.linalg.eigh(C)
lambdas_eig = lambdas_eig[::-1]
V_eig = V_eig[:, ::-1]
# Method 2: SVD on centered data
U_svd, s_svd, Vt_svd = np.linalg.svd(X_c, full_matrices=False)
lambdas_svd = s_svd**2 / (n - 1)
# Compare
print('Explained variances (first 5):')
print(' Eig :', np.round(lambdas_eig[:5], 6))
print(' SVD :', np.round(lambdas_svd[:5], 6))
print(f'Max difference: {np.abs(lambdas_eig[:5] - lambdas_svd[:5]).max():.2e}')
# Compare direction alignment (absolute dot product)
alignments = [abs(V_eig[:, i] @ Vt_svd[i]) for i in range(5)]
print(f'\nPC direction alignments (should be ~1.0):')
print(np.round(alignments, 6))
# Ill-conditioned case: eig vs eigh
X_ill = np.random.randn(50, 5)
X_ill[:, 1] = X_ill[:, 0] + 1e-10 * np.random.randn(50) # near-duplicate column
C_ill = (X_ill - X_ill.mean(0)).T @ (X_ill - X_ill.mean(0)) / 49
evals_eig = np.linalg.eig(C_ill)[0] # may be complex!
evals_eigh = np.linalg.eigh(C_ill)[0] # always real
print(f'\nIll-conditioned: eig returns complex? {np.any(np.iscomplex(evals_eig))}')
print(f'eigh always real: {np.all(np.isreal(evals_eigh))}')
print('Conclusion: prefer eigh (symmetric solver) or SVD of the centered data; generic eig can return complex eigenvalues here')
3. Properties — Decorrelation and Optimality
We verify the key PCA properties: decorrelation of scores, optimal reconstruction (Eckart-Young), and the effect of scale on PCA results.
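In symbols, the properties verified below: the scores $Z = X_c V$ are decorrelated with variances equal to the eigenvalues, and truncating to the top $k$ components attains the Eckart-Young bound:

$$ \operatorname{Cov}(Z) = V^\top C V = \operatorname{diag}(\lambda_1, \dots, \lambda_d), \qquad \min_{\operatorname{rank}(B)\le k} \|X_c - B\|_F^2 = \sum_{i>k} s_i^2. $$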
Code cell 11
# === 3.1 PCA Decorrelates the Scores ===
np.random.seed(42)
n, d = 200, 10
# Highly correlated features
cov_true = 0.8 * np.ones((d, d)) + 0.2 * np.eye(d) # strong correlation
X = np.random.multivariate_normal(np.zeros(d), cov_true, n)
X_c = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_c, full_matrices=False)
Z = U * s # full PCA scores
# Covariance of original data
C_original = np.cov(X.T)
# Covariance of PCA scores
C_scores = np.cov(Z.T)
# Off-diagonal elements
off_diag_original = C_original[np.tril_indices(d, k=-1)]
off_diag_scores = C_scores[np.tril_indices(d, k=-1)]
print(f'Original data — mean |off-diagonal covariance|: {np.abs(off_diag_original).mean():.4f}')
print(f'PCA scores — mean |off-diagonal covariance|: {np.abs(off_diag_scores).mean():.2e}')
print(f'\nPASS: PCA scores are uncorrelated (covariance ~0): {np.abs(off_diag_scores).max() < 1e-10}')
# Eigenvalues of covariance = diagonal of score covariance
lambdas = s**2 / (n - 1)
score_variances = np.var(Z, axis=0, ddof=1)
ok = np.allclose(lambdas, score_variances, rtol=1e-6)
print(f'PASS: Score variances equal eigenvalues: {ok}')
print(f'Lambda values (first 5): {np.round(lambdas[:5], 4)}')
print(f'Score variances (first 5): {np.round(score_variances[:5], 4)}')
Code cell 12
# === 3.2 Eckart-Young: PCA is the Optimal Low-Rank Approximation ===
np.random.seed(42)
n, d = 100, 50
# True low-rank signal + noise
rank_true = 5
L = np.random.randn(n, rank_true)
R = np.random.randn(rank_true, d)
X_signal = L @ R
X_noisy = X_signal + 0.5 * np.random.randn(n, d)
X_c = X_noisy - X_noisy.mean(axis=0)
U, s, Vt = np.linalg.svd(X_c, full_matrices=False)
# Test for k = 1, 2, ..., 10
print('k | PCA recon error | Predicted error (sum discarded σ²) | Match?')
print('-' * 75)
for k in range(1, 11):
# PCA reconstruction
X_recon = U[:, :k] * s[:k] @ Vt[:k]
pca_error = np.linalg.norm(X_c - X_recon, 'fro')**2
predicted_error = (s[k:]**2).sum()
match = np.isclose(pca_error, predicted_error, rtol=1e-8)
print(f'{k:2d} | {pca_error:16.4f} | {predicted_error:38.4f} | {"PASS" if match else "FAIL"}')
print(f'\nPASS: Eckart-Young holds for all k')
print(f'Rank-{rank_true} approximation error: {(s[rank_true:]**2).sum():.4f} (noise only)')
Code cell 13
# === 3.4 Scale Sensitivity ===
np.random.seed(42)
n = 200
# Generate data where both features have equal 'information'
cov_true = np.array([[1.0, 0.8], [0.8, 1.0]]) # equal variance, strong correlation
X_base = np.random.multivariate_normal([0, 0], cov_true, n)
# Scale feature 2 by 100 (mimicking unit change: m → cm)
X_scaled = X_base.copy()
X_scaled[:, 1] *= 100
def first_pc(X):
X_c = X - X.mean(0)
_, _, Vt = np.linalg.svd(X_c, full_matrices=False)
return Vt[0]
pc1_original = first_pc(X_base)
pc1_scaled = first_pc(X_scaled)
pc1_standardized = first_pc((X_scaled - X_scaled.mean(0)) / X_scaled.std(0))
print('PC1 directions:')
print(f' Original (equal scale): {np.round(pc1_original, 4)}')
print(f' Scaled (×100 on feat2): {np.round(pc1_scaled, 4)}')
print(f' Standardized: {np.round(pc1_standardized, 4)}')
print()
print('Feature 2 dominates when unscaled!')
print(f'PC1 feature-2 loading without standardization: {abs(pc1_scaled[1]):.4f}')
print(f'PC1 feature-2 loading with standardization: {abs(pc1_standardized[1]):.4f}')
print('\nConclusion: Standardize when features have different units/scales.')
4. Explained Variance and Component Selection
We compute scree plots, cumulative variance, and compare selection methods.
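Written out, the selection rules used in the next cell (the Kaiser rule keeps components with $\lambda_i > 1$ when the data are standardized):

$$ \mathrm{EVR}_i = \frac{\lambda_i}{\sum_{j=1}^{d} \lambda_j}, \qquad k(\tau) = \min\Big\{ k : \sum_{i=1}^{k} \mathrm{EVR}_i \ge \tau \Big\}. $$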
Code cell 15
# === 4.1 Scree Plots and Variance Analysis ===
import numpy as np
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.random.seed(42)
# Generate data with known structure: first 5 dims carry signal
n, d = 500, 50
# True covariance: first k eigenvalues are large, rest are small
k_true = 5
signal_vars = np.array([20, 15, 10, 7, 4]) # signal eigenvalues
noise_var = 0.5
lambdas_true = np.concatenate([signal_vars, noise_var * np.ones(d - k_true)])
Q = np.linalg.qr(np.random.randn(d, d))[0] # random orthogonal
Sigma = Q @ np.diag(lambdas_true) @ Q.T
X = np.random.multivariate_normal(np.zeros(d), Sigma, n)
# PCA via SVD
X_c = X - X.mean(0)
_, s, _ = np.linalg.svd(X_c, full_matrices=False)
lambdas_est = s**2 / (n - 1)
var_ratio = lambdas_est / lambdas_est.sum()
cumvar = np.cumsum(var_ratio)
print('Component selection by threshold:')
for tau in [0.80, 0.90, 0.95, 0.99]:
k = int(np.searchsorted(cumvar, tau)) + 1
print(f' {tau:.0%} variance: k = {k} components')
# Kaiser criterion (for standardized data)
X_std = X_c / X_c.std(0, ddof=1)
_, s_std, _ = np.linalg.svd(X_std, full_matrices=False)
lambdas_std = s_std**2 / (n - 1)
k_kaiser = (lambdas_std > 1.0).sum()
print(f'\nKaiser criterion: k = {k_kaiser} components (λ > 1)')
print(f'True signal dimension: k = {k_true}')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax = axes[0]
ax.bar(range(1, min(20, d)+1), lambdas_est[:20], color='steelblue', alpha=0.8)
ax.axhline(y=noise_var, color='red', ls='--', label=f'True noise level ({noise_var})')
ax.axvline(x=k_true + 0.5, color='green', ls='--', label=f'True signal dim (k={k_true})')
ax.set_xlabel('Component index')
ax.set_ylabel('Eigenvalue (explained variance)')
ax.set_title('Scree Plot')
ax.legend()
ax = axes[1]
ax.plot(range(1, d+1), cumvar * 100, 'steelblue', lw=2)
for tau, ls in [(0.80, ':'), (0.90, '--'), (0.95, '-'), (0.99, '-')]:
ax.axhline(y=tau*100, color='gray', ls=ls, alpha=0.7, label=f'{tau:.0%}')
ax.axvline(x=k_true, color='green', ls='--', label=f'True k={k_true}')
ax.set_xlabel('Number of components k')
ax.set_ylabel('Cumulative Explained Variance (%)')
ax.set_title('Cumulative EVR')
ax.legend(fontsize=9)
plt.tight_layout()
plt.show()
5. PCA Whitening
Whitening transforms data to have identity covariance. We compare PCA whitening (in PC space) and ZCA whitening (in original feature space).
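Both transforms rescale the PCA scores to unit variance; ZCA then rotates back to the original feature axes. With $C = V \Lambda V^\top$:

$$ Z_{\mathrm{PCA}} = X_c\, V \Lambda^{-1/2}, \qquad Z_{\mathrm{ZCA}} = X_c\, V \Lambda^{-1/2} V^\top = X_c\, C^{-1/2}. $$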
Code cell 17
# === 5.1 PCA Whitening vs ZCA Whitening ===
import numpy as np
np.random.seed(42)
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
n = 400
cov = np.array([[9.0, 6.0], [6.0, 5.0]])
X = np.random.multivariate_normal([0, 0], cov, n)
def pca_whiten(X):
    X_c = X - X.mean(0)
    U, s, Vt = np.linalg.svd(X_c, full_matrices=False)
    # Columns of U are orthonormal with zero mean, so rescaling by sqrt(n-1)
    # gives scores with unit variance along every PC axis.
    return U * np.sqrt(len(X) - 1)
def zca_whiten(X):
    X_c = X - X.mean(0)
    U, s, Vt = np.linalg.svd(X_c, full_matrices=False)
    # ZCA: PCA-whiten, then rotate back to the original feature axes
    n = len(X)
    Z_pca = X_c @ Vt.T @ np.diag(1.0 / s) * np.sqrt(n - 1)
    return Z_pca @ Vt
Z_pca = pca_whiten(X)
Z_zca = zca_whiten(X)
# Verify covariance = I
C_pca = np.cov(Z_pca.T)
C_zca = np.cov(Z_zca.T)
print('PCA-white covariance (should be I):')
print(np.round(C_pca, 3))
print('\nZCA-white covariance (should be I):')
print(np.round(C_zca, 3))
print(f'\nPASS PCA: {np.allclose(C_pca, np.eye(2), atol=0.05)}')
print(f'PASS ZCA: {np.allclose(C_zca, np.eye(2), atol=0.05)}')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
datasets = [(X, 'Original data\n(correlated ellipse)'),
(Z_pca, 'PCA whitening\n(rotated circle)'),
(Z_zca, 'ZCA whitening\n(circle, original axes)')]
for ax, (data, title) in zip(axes, datasets):
ax.scatter(data[:, 0], data[:, 1], alpha=0.3, s=10, c='steelblue')
ax.set_aspect('equal')
ax.set_title(title)
lim = max(abs(data).max() * 1.1, 1)
ax.set_xlim(-lim, lim); ax.set_ylim(-lim, lim)
ax.axhline(0, color='gray', lw=0.5); ax.axvline(0, color='gray', lw=0.5)
plt.tight_layout()
plt.show()
6. Probabilistic PCA (PPCA)
PPCA embeds PCA in a latent variable model. The MLE has a closed form, and the noise parameter estimates the average discarded variance.
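For reference, the generative model and the closed-form MLE (Tipping & Bishop, 1999) that the ppca_mle function below implements, with $V_k$, $\Lambda_k$ the top-$k$ eigenpairs of the sample covariance and $R$ an arbitrary orthogonal matrix (the code takes $R = I$):

$$ x = W z + \mu + \varepsilon, \quad z \sim \mathcal{N}(0, I_k), \;\; \varepsilon \sim \mathcal{N}(0, \sigma^2 I_d), $$

$$ \hat{\sigma}^2 = \frac{1}{d-k} \sum_{i=k+1}^{d} \lambda_i, \qquad \hat{W} = V_k \big(\Lambda_k - \hat{\sigma}^2 I_k\big)^{1/2} R. $$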
Code cell 19
# === 6.1 PPCA: Closed-Form MLE ===
import numpy as np
np.random.seed(42)
def ppca_mle(X_c, k):
"""
Probabilistic PCA MLE (Tipping & Bishop 1999).
X_c: centered data (n, d)
k: number of latent dimensions
Returns: W, sigma2, log_likelihood
"""
n, d = X_c.shape
# Eigendecomposition of sample covariance
C = X_c.T @ X_c / n
lambdas, V = np.linalg.eigh(C)
lambdas = lambdas[::-1]; V = V[:, ::-1] # descending order
# MLE for sigma^2: mean of discarded eigenvalues
sigma2 = lambdas[k:].mean() if d > k else 1e-10
# MLE for W
Lambda_k = np.diag(lambdas[:k])
W = V[:, :k] @ np.diag(np.sqrt(np.maximum(lambdas[:k] - sigma2, 0)))
# Log-likelihood at MLE
C_model = W @ W.T + sigma2 * np.eye(d)
sign, logdet = np.linalg.slogdet(C_model)
loglik = -n/2 * (d * np.log(2*np.pi) + logdet
+ np.trace(np.linalg.solve(C_model, C)))
return W, sigma2, loglik
# Generate data from a PPCA model with k=3, d=20
d, k_true, n = 20, 3, 500
W_true = np.random.randn(d, k_true)
sigma2_true = 0.5
Z_latent = np.random.randn(n, k_true)
noise = np.sqrt(sigma2_true) * np.random.randn(n, d)
X = Z_latent @ W_true.T + noise
X_c = X - X.mean(0)
print('PPCA MLE for different k:')
print(f'{"k":>4} | {"sigma2":>10} | {"log-likelihood":>16}')
print('-' * 40)
for k in range(1, 8):
W, s2, loglik = ppca_mle(X_c, k)
marker = ' ← true k' if k == k_true else ''
print(f'{k:4d} | {s2:10.4f} | {loglik:16.2f}{marker}')
# Verify: sigma2 at true k ≈ mean discarded eigenvalue
_, s_svd, _ = np.linalg.svd(X_c, full_matrices=False)
lambdas_all = s_svd**2 / n
sigma2_expected = lambdas_all[k_true:].mean()
_, sigma2_mle, _ = ppca_mle(X_c, k_true)
print(f'\nPASS sigma2: MLE={sigma2_mle:.4f}, expected (discarded mean)={sigma2_expected:.4f}')
print(f'True sigma2 = {sigma2_true}')
7. Kernel PCA
Kernel PCA extends PCA to non-linear structure via the kernel trick. We demonstrate on the two-moons dataset.
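Both cells below perform the same computation: build the kernel matrix, center it in feature space, solve the eigenproblem of the centered kernel, and read off the training-point scores as scaled eigenvectors. Here $\mathbf{1}_n$ denotes the $n \times n$ matrix with every entry $1/n$:

$$ \tilde{K} = K - \mathbf{1}_n K - K \mathbf{1}_n + \mathbf{1}_n K \mathbf{1}_n, \qquad \tilde{K} \alpha_i = \lambda_i \alpha_i, \qquad z_i = \tilde{K}\,\frac{\alpha_i}{\sqrt{\lambda_i}} = \sqrt{\lambda_i}\,\alpha_i. $$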
Code cell 21
# === 7.1 Kernel PCA vs Linear PCA ===
import numpy as np
np.random.seed(42)
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
try:
from sklearn.datasets import make_moons
from sklearn.decomposition import PCA, KernelPCA
HAS_SKLEARN = True
except ImportError:
HAS_SKLEARN = False
print('sklearn not available')
if HAS_SKLEARN:
X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)
# Linear PCA
pca_lin = PCA(n_components=2)
Z_lin = pca_lin.fit_transform(X_moons)
# Kernel PCA (RBF)
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=1.0)
Z_kpca = kpca.fit_transform(X_moons)
print('Linear PCA: cannot separate two moons (overlapping projections)')
print(f' PC1 variance: {pca_lin.explained_variance_ratio_[0]:.2%}')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
titles = ['Original two moons', 'Linear PCA (2D)', 'Kernel PCA RBF γ=1 (2D)']
datas = [X_moons, Z_lin, Z_kpca]
for ax, data, title in zip(axes, datas, titles):
for c in [0, 1]:
mask = y_moons == c
ax.scatter(data[mask, 0], data[mask, 1],
label=f'Class {c}', alpha=0.7, s=20)
ax.set_title(title)
ax.legend(fontsize=9)
plt.tight_layout()
plt.show()
print('Kernel PCA separates the two moons — linear PCA cannot!')
else:
print('Skipping kernel PCA demo (sklearn not installed)')
Code cell 22
# === 7.2 Kernel PCA from Scratch ===
import numpy as np
np.random.seed(42)
def rbf_kernel(X, Y, gamma=1.0):
"""RBF kernel matrix between X (n×d) and Y (m×d)."""
# ||x - y||^2 = ||x||^2 - 2x^T y + ||y||^2
X_sq = (X**2).sum(axis=1, keepdims=True)
Y_sq = (Y**2).sum(axis=1, keepdims=True)
sq_dists = X_sq + Y_sq.T - 2 * X @ Y.T
return np.exp(-gamma * sq_dists)
def center_kernel(K):
"""Center the kernel matrix."""
n = K.shape[0]
one_n = np.ones((n, n)) / n
return K - one_n @ K - K @ one_n + one_n @ K @ one_n
def kernel_pca(X, k=2, gamma=1.0):
"""Kernel PCA with RBF kernel."""
n = len(X)
# Compute and center kernel
K = rbf_kernel(X, X, gamma)
K_c = center_kernel(K)
# Eigendecomposition
lambdas, alphas = np.linalg.eigh(K_c)
lambdas = lambdas[::-1]; alphas = alphas[:, ::-1]
# Keep top k; normalize
lam_k = lambdas[:k]
alpha_k = alphas[:, :k] / np.sqrt(np.maximum(lam_k, 1e-10))
# Scores
scores = K_c @ alpha_k
    return scores, lam_k, alpha_k, K, K.mean(0)  # K.mean(0): training-kernel column means (needed to center test kernels)
try:
from sklearn.datasets import make_moons
X_moons, y_moons = make_moons(n_samples=200, noise=0.1, random_state=42)
Z_scratch, lam, alpha, K_train, K_row_means = kernel_pca(X_moons, k=2, gamma=1.0)
# Compare with sklearn
from sklearn.decomposition import KernelPCA
kpca_ref = KernelPCA(n_components=2, kernel='rbf', gamma=1.0)
Z_sklearn = kpca_ref.fit_transform(X_moons)
# Check alignment (up to sign and scaling)
align = abs(np.corrcoef(Z_scratch[:, 0], Z_sklearn[:, 0])[0, 1])
print(f'From-scratch vs sklearn alignment on PC1: {align:.6f}')
print(f'PASS: {align > 0.99}')
print(f'\nTop 5 kernel eigenvalues: {np.round(lam[:5], 4)}')
except ImportError:
print('sklearn not available — running kernel PCA on synthetic data')
# Fallback: generate two-circles data manually
t = np.linspace(0, 2*np.pi, 100)
X1 = np.column_stack([np.cos(t), np.sin(t)])
X2 = np.column_stack([2*np.cos(t), 2*np.sin(t)])
X_circles = np.vstack([X1, X2])
Z, lam, _, _, _ = kernel_pca(X_circles, k=2, gamma=0.5)
print(f'Kernel PCA on circles: top eigenvalues {np.round(lam[:3], 3)}')
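A minimal sketch, not part of the original cells, of how new points could be projected with the from-scratch kernel PCA; it assumes the sklearn branch above ran, so X_moons, alpha, and K_train are in scope, and the names X_new and Z_new are illustrative:

# Hypothetical out-of-sample projection (illustrative, not from the original notebook).
# A new kernel row must be centered with the *training* kernel statistics.
X_new = np.array([[0.5, 0.25], [-0.5, 0.75]])        # illustrative query points
K_new = rbf_kernel(X_new, X_moons, gamma=1.0)        # (m, n) kernel against training points
K_new_c = (K_new
           - K_train.mean(axis=0)                    # subtract training-kernel column means
           - K_new.mean(axis=1, keepdims=True)       # subtract each new row's mean
           + K_train.mean())                         # add back the grand mean
Z_new = K_new_c @ alpha                              # scores in kernel PC space
print('Projected new points:\n', np.round(Z_new, 4))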
9. Applications in Machine Learning and AI
We demonstrate key AI applications: eigenface-style image compression, intrinsic dimensionality of LLM-like representations, and LoRA rank analysis.
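For reference, the two dimensionality measures computed in 9.2: the participation ratio of the eigenvalue spectrum and the smallest component count reaching a cumulative-variance threshold $\tau$:

$$ \mathrm{PR} = \frac{\big(\sum_i \lambda_i\big)^2}{\sum_i \lambda_i^2}, \qquad k_{\tau} = \min\Big\{ k : \sum_{i=1}^{k} \lambda_i \ge \tau \sum_{i} \lambda_i \Big\}. $$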
Code cell 24
# === 9.1 Eigenfaces: Image Compression via PCA ===
import numpy as np
np.random.seed(42)
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
# Simulate face-like data: structured low-rank patches
n_faces, h, w = 100, 16, 16
d = h * w # 256
# Generate 'faces' as low-rank structure + noise
k_face = 10 # true face dimensionality
face_basis = np.random.randn(k_face, d) * 10
face_basis /= np.linalg.norm(face_basis, axis=1, keepdims=True)
coeffs = np.random.randn(n_faces, k_face)
X_faces = coeffs @ face_basis + 3 * np.random.randn(n_faces, d)
# PCA
mean_face = X_faces.mean(0)
X_c = X_faces - mean_face
U, s, Vt = np.linalg.svd(X_c, full_matrices=False)
# Reconstruction quality at different k
print('Reconstruction quality vs k:')
print(f'{"k":>5} | {"EVR":>8} | {"Recon Error":>12} | {"Compression ratio":>18}')
print('-' * 55)
for k in [1, 5, 10, 20, 50]:
X_recon = U[:, :k] * s[:k] @ Vt[:k] + mean_face
error = np.linalg.norm(X_faces - X_recon, 'fro')**2 / np.linalg.norm(X_faces, 'fro')**2
evr = (s[:k]**2).sum() / (s**2).sum()
compression = n_faces * d / (n_faces * k + k * d + d)
print(f'{k:5d} | {evr:8.3%} | {error:12.6f} | {compression:18.1f}x')
if HAS_MPL:
# Show first 6 eigenfaces
fig, axes = plt.subplots(2, 6, figsize=(14, 5))
axes[0, 0].imshow(mean_face.reshape(h, w), cmap='viridis')
axes[0, 0].set_title('Mean face')
axes[0, 0].axis('off')
for i in range(5):
ax = axes[0, i+1]
ax.imshow(Vt[i].reshape(h, w), cmap='RdBu_r')
ax.set_title(f'PC{i+1} ({s[i]**2/(s**2).sum():.1%})')
ax.axis('off')
# Show original and reconstructions for one face
face_idx = 0
for j, k in enumerate([1, 5, 10, 20, 50]):
ax = axes[1, j+1]
recon = U[face_idx, :k] * s[:k] @ Vt[:k] + mean_face
ax.imshow(recon.reshape(h, w), cmap='viridis')
ax.set_title(f'k={k}')
ax.axis('off')
axes[1, 0].imshow(X_faces[face_idx].reshape(h, w), cmap='viridis')
axes[1, 0].set_title('Original')
axes[1, 0].axis('off')
fig.suptitle('Eigenfaces (top row) and Reconstruction Quality (bottom row)')
plt.tight_layout()
plt.show()
Code cell 25
# === 9.2 Intrinsic Dimensionality of LLM-like Representations ===
import numpy as np
np.random.seed(42)
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
def participation_ratio(X_c):
"""Continuous intrinsic dimensionality measure."""
_, s, _ = np.linalg.svd(X_c, full_matrices=False)
lambdas = s**2
return lambdas.sum()**2 / (lambdas**2).sum()
def k_for_evr(X_c, tau=0.90):
"""Number of components for tau cumulative EVR."""
_, s, _ = np.linalg.svd(X_c, full_matrices=False)
var_ratio = s**2 / (s**2).sum()
cumvar = np.cumsum(var_ratio)
return int(np.searchsorted(cumvar, tau)) + 1
# Simulate LLM activations: low intrinsic dim in d=512
d_model = 512
n_tokens = 1000
# Simulate multiple 'layers' with increasing then decreasing intrinsic dim
n_layers = 12
results = []
for layer in range(n_layers):
# Intrinsic dim increases in early layers, peaks mid, decreases late
k_eff = int(20 + 30 * np.sin(np.pi * layer / (n_layers - 1)))
# Generate activations with k_eff effective dimensions
signal_vars = np.exp(-np.arange(k_eff) / (k_eff / 3))
noise_var = 0.01
Q = np.linalg.qr(np.random.randn(d_model, d_model))[0]
lambdas_l = np.concatenate([signal_vars * 10,
noise_var * np.ones(d_model - k_eff)])
X_layer = np.random.randn(n_tokens, d_model) @ np.diag(np.sqrt(lambdas_l)) @ Q[:d_model].T
X_c_l = X_layer - X_layer.mean(0)
pr = participation_ratio(X_c_l)
k_90 = k_for_evr(X_c_l, tau=0.90)
results.append({'layer': layer, 'PR': pr, 'k_90': k_90, 'k_true': k_eff})
print(f'Layer | True k | PR | k@90%EVR')
print('-' * 40)
for r in results:
print(f"{r['layer']:5d} | {r['k_true']:6d} | {r['PR']:6.1f} | {r['k_90']:9d}")
if HAS_MPL:
layers = [r['layer'] for r in results]
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(layers, [r['PR'] for r in results], 'o-', label='Participation Ratio', lw=2)
ax.plot(layers, [r['k_90'] for r in results], 's--', label='k at 90% EVR', lw=2)
ax.plot(layers, [r['k_true'] for r in results], '^:', label='True signal dim', lw=2)
ax.set_xlabel('Layer index')
ax.set_ylabel('Intrinsic dimensionality')
ax.set_title('Intrinsic Dimensionality Across Simulated LLM Layers')
ax.legend()
plt.tight_layout()
plt.show()
Code cell 26
# === 9.4 LoRA: Low-Rank Hypothesis Verification ===
import numpy as np
np.random.seed(42)
try:
import matplotlib.pyplot as plt
HAS_MPL = True
except ImportError:
HAS_MPL = False
# Simulate a fine-tuning weight update with known low rank
d_out, d_in = 256, 128
r_true = 8 # true rank of delta W
# Generate true low-rank delta W
U_true = np.random.randn(d_out, r_true)
V_true = np.random.randn(d_in, r_true)
scale = np.exp(-np.arange(r_true) * 0.5) # decaying singular values
delta_W_true = (U_true * scale) @ V_true.T / r_true
# Add small noise to simulate real fine-tuning
noise_level = 0.01
delta_W = delta_W_true + noise_level * np.random.randn(d_out, d_in)
# SVD of delta W
U_s, s, Vt_s = np.linalg.svd(delta_W, full_matrices=False)
print('Singular value spectrum of delta_W:')
print(f'First 15 singular values: {np.round(s[:15], 4)}')
print(f'Noise floor (expected ~{noise_level:.2f}): {np.round(s[r_true:r_true+5], 4)}')
# LoRA reconstruction for different ranks
print(f'\nLoRA reconstruction error vs optimal (Eckart-Young) error:')
print(f'{"r":>5} | {"EY error":>12} | {"Params (LoRA)":>15} | {"Compression":>12}')
print('-' * 55)
for r in [1, 2, 4, 8, 16, 32]:
ey_error = ((s[r:]**2).sum())**0.5 # Eckart-Young
params_lora = r * (d_out + d_in) # LoRA params
params_full = d_out * d_in # Full delta W
compression = params_full / params_lora
marker = ' ← true rank' if r == r_true else ''
print(f'{r:5d} | {ey_error:12.6f} | {params_lora:15,d} | {compression:11.1f}x{marker}')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax = axes[0]
ax.semilogy(range(1, len(s)+1), s, 'o-', ms=4, lw=1.5, c='steelblue')
ax.axvline(r_true + 0.5, color='red', ls='--', label=f'True rank r={r_true}')
ax.axhline(noise_level, color='green', ls=':', label=f'Noise level {noise_level}')
ax.set_xlabel('Singular value index')
ax.set_ylabel('Singular value (log scale)')
ax.set_title('Singular Value Spectrum of ΔW')
ax.legend()
ax = axes[1]
r_vals = range(1, 33)
errors = [((s[r:]**2).sum())**0.5 for r in r_vals]
ax.plot(r_vals, errors, 'o-', c='steelblue', lw=2)
ax.axvline(r_true, color='red', ls='--', label=f'True rank r={r_true}')
ax.set_xlabel('LoRA rank r')
ax.set_ylabel('Reconstruction error ||ΔW - ΔWᵣ||_F')
ax.set_title('LoRA Reconstruction Error (= Eckart-Young error)')
ax.legend()
plt.tight_layout()
plt.show()
print(f'\nTakeaway: LoRA with r={r_true} achieves near-zero error (noise floor only).')
print(f'Compression: {d_out*d_in / (r_true*(d_out+d_in)):.1f}x vs storing full ΔW')
Summary
This notebook demonstrated:
- Geometry — PCA finds the axes of maximum variance (the covariance ellipse)
- Derivation — Variance maximization leads to the eigenvalue equation
- SVD connection — SVD of centered data gives the same result, more stably
- Decorrelation — PCA scores have diagonal covariance: Cov(Z) = Λ (the eigenvalues on the diagonal)
- Eckart-Young — PCA is the optimal low-rank approximation (verified numerically)
- Whitening — PCA- and ZCA-whitening produce identity covariance, in different frames
- PPCA — Probabilistic model recovers PCA as σ² → 0; the MLE σ² equals the mean discarded eigenvalue
- Kernel PCA — Non-linear structure detected via kernel trick; centering is critical
- AI applications — Intrinsic dimension, LoRA rank analysis, eigenfaces
Next: Linear Transformations →