Theory Notebook
Converted from theory.ipynb for web reading.
Matrix Decompositions
"The purpose of computing is insight, not numbers — and no insight is more powerful than factoring a matrix into pieces whose structure you understand."
This notebook implements and explores all key algorithms from §08: LU with pivoting, Householder QR, Givens rotations, rank-revealing QR, blocked algorithms, GP inference via Cholesky, and randomized methods.
Run cells top-to-bottom for best results.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style='whitegrid', palette='colorblind')
HAS_SNS = True
except ImportError:
plt.style.use('seaborn-v0_8-whitegrid')
HAS_SNS = False
mpl.rcParams.update({
'figure.figsize': (10, 6),
'figure.dpi': 120,
'font.size': 13,
'axes.titlesize': 15,
'axes.labelsize': 13,
'lines.linewidth': 2.0,
'axes.spines.top': False,
'axes.spines.right': False,
})
HAS_MPL = True
except ImportError:
HAS_MPL = False
COLORS = {
'primary': '#0077BB',
'secondary': '#EE7733',
'tertiary': '#009988',
'error': '#CC3311',
'neutral': '#555555',
'highlight': '#EE3377',
}
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. Libraries loaded.')
1. Triangular Systems
Triangular systems are the 'solved case' in linear algebra: forward substitution solves $Lx = b$ in $O(n^2)$ flops, and backward substitution solves $Ux = b$ in $O(n^2)$ flops. All factorizations reduce to triangular solves.
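As a quick check on the $O(n^2)$ claim: row $i$ of forward substitution costs about $2i$ flops (a dot product of length $i$ plus one subtraction), so the total is $\sum_{i=0}^{n-1} 2i \approx n^2$ flops; backward substitution adds one division per row, for roughly $n^2 + n$ flops.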
Code cell 5
# === 1.1 Forward and Backward Substitution ===
def forward_sub(L, b):
"""Solve Lx=b for unit lower triangular L (no division)."""
n = len(b)
x = b.copy().astype(float)
for i in range(1, n):
x[i] -= L[i, :i] @ x[:i]
return x
def backward_sub(U, b):
"""Solve Ux=b for upper triangular U."""
n = len(b)
x = b.copy().astype(float)
for i in range(n-1, -1, -1):
x[i] -= U[i, i+1:] @ x[i+1:]
x[i] /= U[i, i]
return x
# Test on a 5x5 system
np.random.seed(42)
L = np.tril(np.random.randn(5, 5))
np.fill_diagonal(L, 1.0) # unit lower triangular
U = np.triu(np.random.randn(5, 5))
b = np.random.randn(5)
x_fwd = forward_sub(L, b)
x_bwd = backward_sub(U, b)
x_fwd_ref = la.solve_triangular(L, b, lower=True, unit_diagonal=True)
x_bwd_ref = la.solve_triangular(U, b, lower=False)
ok1 = np.allclose(x_fwd, x_fwd_ref)
ok2 = np.allclose(x_bwd, x_bwd_ref)
print(f"{'PASS' if ok1 else 'FAIL'} — Forward substitution matches scipy")
print(f"{'PASS' if ok2 else 'FAIL'} — Backward substitution matches scipy")
# Residuals
res_fwd = np.linalg.norm(L @ x_fwd - b) / np.linalg.norm(b)
res_bwd = np.linalg.norm(U @ x_bwd - b) / np.linalg.norm(b)
print(f"Forward sub residual: {res_fwd:.2e} (expect O(u) = O(1e-16))")
print(f"Backward sub residual: {res_bwd:.2e}")
2. LU Factorization
LU factorization decomposes $A$ into lower and upper triangular factors, $A = LU$. Pivoting is essential: without it, tiny pivots cause explosive element growth and catastrophic loss of accuracy.
Code cell 7
# === 2.1 Naive LU Without Pivoting — Demonstrating Failure ===
def lu_naive(A):
"""Gaussian elimination WITHOUT pivoting. Returns L, U (may fail)."""
n = A.shape[0]
A = A.copy().astype(float)
L = np.eye(n)
for k in range(n-1):
if abs(A[k, k]) < 1e-300:
raise ValueError(f'Zero pivot at step {k}')
L[k+1:, k] = A[k+1:, k] / A[k, k] # multipliers
A[k+1:, :] -= np.outer(L[k+1:, k], A[k, :])
return L, np.triu(A)
# Example 1: Works fine for a nice matrix
A_nice = np.array([[3., 1., 2.],
[6., 4., 5.],
[9., 8., 10.]])
L_nice, U_nice = lu_naive(A_nice)
err_nice = np.linalg.norm(A_nice - L_nice @ U_nice)
print(f'Naive LU on nice matrix: ||A - LU|| = {err_nice:.2e}')
# Example 2: Catastrophic failure with tiny pivot
eps = 1e-16
A_bad = np.array([[eps, 1.], [1., 2.]])
b_bad = np.array([1., 3.])
# True solution (well-conditioned): x ≈ (1, 1)
x_true = np.linalg.solve(A_bad, b_bad)
print(f'True solution: {x_true}')
# Naive LU solution
L_bad, U_bad = lu_naive(A_bad)
y = forward_sub(L_bad, b_bad)
x_naive = backward_sub(U_bad, y)
print(f'Naive LU solution: {x_naive}')
print(f'Relative error: {np.linalg.norm(x_naive - x_true)/np.linalg.norm(x_true):.2e}')
print(f'Growth factor rho_2 = {abs(U_bad[1,1]) / abs(A_bad).max():.2e}')
Code cell 8
# === 2.2 LU With Partial Pivoting — PA = LU ===
def lu_pivot(A):
"""LU with partial pivoting. Returns P (perm array), L, U."""
n = A.shape[0]
A = A.copy().astype(float)
piv = np.arange(n)
L = np.zeros((n, n))
np.fill_diagonal(L, 1.0)
for k in range(n-1):
# Find pivot
p = k + np.argmax(np.abs(A[k:, k]))
# Swap rows
A[[k, p]] = A[[p, k]]
L[[k, p], :k] = L[[p, k], :k]
piv[[k, p]] = piv[[p, k]]
# Compute multipliers
if abs(A[k, k]) < 1e-14:
continue
L[k+1:, k] = A[k+1:, k] / A[k, k]
A[k+1:] -= np.outer(L[k+1:, k], A[k])
U = np.triu(A)
return piv, L, U
# Test on the 'bad' matrix
piv, L_p, U_p = lu_pivot(A_bad)
P_mat = np.eye(2)[piv] # permutation matrix
err_PA_LU = np.linalg.norm(P_mat @ A_bad - L_p @ U_p)
print(f'Partial pivot: ||PA - LU|| = {err_PA_LU:.2e}')
# Solve using pivoted LU
b_perm = b_bad[piv]
y = forward_sub(L_p, b_perm)
x_piv = backward_sub(U_p, y)
print(f'Pivoted LU solution: {x_piv}')
print(f'Relative error: {np.linalg.norm(x_piv - x_true)/np.linalg.norm(x_true):.2e}')
# Verify on a larger random system
n = 50
A_rand = np.random.randn(n, n)
b_rand = np.random.randn(n)
piv_r, L_r, U_r = lu_pivot(A_rand)
P_r = np.eye(n)[piv_r]
err_factorization = np.linalg.norm(P_r @ A_rand - L_r @ U_r, 'fro')
print(f'||PA - LU||_F for n=50 random: {err_factorization:.2e}')
# Compare with scipy
P_sci, L_sci, U_sci = la.lu(A_rand)
x_ref = np.linalg.solve(A_rand, b_rand)
b_p = b_rand[piv_r]
y_p = la.solve_triangular(L_r, b_p, lower=True, unit_diagonal=True)
x_p = la.solve_triangular(U_r, y_p, lower=False)
print(f'Solution relative error vs numpy: {np.linalg.norm(x_p - x_ref)/np.linalg.norm(x_ref):.2e}')
Code cell 9
# === 2.3 Growth Factor Analysis ===
def growth_factor(A):
"""Compute growth factor for LU with partial pivoting."""
P, L, U = la.lu(A)
# Reconstruct intermediate matrices by stepping through elimination
max_orig = np.abs(A).max()
# Growth factor = max entry in any intermediate / max in original
# Approximated via scipy's internally computed LU
return np.abs(U).max() / max_orig
# Random matrices of various sizes
sizes = [10, 20, 50, 100, 200]
gf_random = []
for n in sizes:
rhos = [growth_factor(np.random.randn(n, n)) for _ in range(10)]
gf_random.append(np.mean(rhos))
print(f'n={n}: avg growth factor = {np.mean(rhos):.2f}, max = {np.max(rhos):.2f}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(sizes, gf_random, 'o-', color=COLORS['primary'], label='Random matrix (avg)')
ax.plot(sizes, [n**(2/3) for n in sizes], '--', color=COLORS['secondary'],
label=r'$n^{2/3}$ (empirical bound)')
ax.plot(sizes, [2**(n-1) for n in sizes], ':', color=COLORS['error'],
label=r'$2^{n-1}$ (worst case)')
ax.set_yscale('log')
ax.set_xlabel('Matrix size $n$')
ax.set_ylabel('Growth factor $\\rho_n$')
ax.set_title('LU growth factor: random vs theoretical bounds')
ax.legend()
fig.tight_layout()
plt.show()
print('Growth factor plot shown.')
Code cell 10
# === 2.4 Determinant via LU ===
def det_via_lu(A):
"""Compute det(A) = (-1)^s * prod(diag(U)) from partial-pivot LU."""
import scipy.linalg as la
    # scipy.linalg.lu returns P, L, U with A = P @ L @ U
    P, L, U = la.lu(A)
    # det(P) = +1 or -1, the parity of the permutation (det(L) = 1 since L is unit triangular)
    det_P = np.linalg.det(P)  # exact +-1 here; a sign-tracking LU would avoid this extra O(n^3) call
det_U = np.prod(np.diag(U))
return det_P * det_U
np.random.seed(42)
for n in [3, 5, 8, 10]:
A_test = np.random.randn(n, n)
det_lu = det_via_lu(A_test)
det_np = np.linalg.det(A_test)
rel_err = abs(det_lu - det_np) / abs(det_np)
print(f'n={n}: det(LU)={det_lu:.6f}, det(numpy)={det_np:.6f}, rel_err={rel_err:.2e}')
# Special case: singular matrix
A_sing = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
print(f'Singular matrix det = {det_via_lu(A_sing):.2e} (expect 0)')
3. QR Factorization: Householder Algorithm
Householder reflectors are orthogonal matrices that zero out subdiagonal entries. They are unconditionally backward stable: the orthogonality error $\|Q^\top Q - I\|$ stays at $O(u)$ regardless of the condition number.
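For reference, the reflector constructed in the next cell is
$$H = I - \beta\, v v^\top, \qquad \beta = \frac{2}{v^\top v}, \qquad v = a - \alpha e_1, \qquad \alpha = -\operatorname{sign}(a_1)\,\lVert a\rVert_2,$$
so that $Ha = \alpha e_1$. Choosing $\alpha$ with the opposite sign of $a_1$ makes $v_1 = a_1 - \alpha$ a sum of same-signed terms, which avoids cancellation.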
Code cell 12
# === 3.1 Householder Reflection Construction ===
def householder_vec(a):
"""Compute Householder vector v such that H @ a = alpha * e_1.
Sign convention: alpha = -sign(a[0]) * norm(a) to avoid cancellation."""
n = len(a)
sigma = np.linalg.norm(a)
if sigma == 0:
return np.zeros(n), 0.0
alpha = -np.sign(a[0]) * sigma
v = a.copy().astype(float)
v[0] -= alpha # v = a - alpha * e_1
beta = 2.0 / (v @ v) # 2 / ||v||^2
return v, beta
def householder_apply(v, beta, C):
"""Apply H = I - beta*v*v^T to C implicitly. O(len(v) * C.shape[1])."""
w = beta * (v @ C) # row vector of inner products
return C - np.outer(v, w)
# Test: H @ a should equal alpha * e_1
a = np.array([3., 1., 4., 1., 5.])
v, beta = householder_vec(a)
Ha = a - beta * v * (v @ a)
alpha_expected = -np.sign(a[0]) * np.linalg.norm(a)
expected = np.zeros_like(a)
expected[0] = alpha_expected
ok = np.allclose(Ha, expected, atol=1e-14)
print(f"{'PASS' if ok else 'FAIL'} — H @ a = alpha * e_1")
print(f'Ha = {Ha}')
print(f'Expected: {expected}')
print(f'Entries 1-4 are zero: {np.allclose(Ha[1:], 0)}')
# Verify H is orthogonal (implicitly)
n = 5
H_explicit = np.eye(n) - beta * np.outer(v, v)
ok_orth = np.allclose(H_explicit @ H_explicit.T, np.eye(n))
print(f"{'PASS' if ok_orth else 'FAIL'} — H is orthogonal")
Code cell 13
# === 3.2 Sign Convention — Avoiding Cancellation ===
# Show why we use alpha = -sign(a[0])*norm(a)
a_pos = np.array([10.0, 1.0, 1.0]) # a[0] positive
a_neg = np.array([-10.0, 1.0, 1.0]) # a[0] negative
print('Case: a[0] > 0')
print(f' Good sign: v[0] = a[0] - (-norm(a)) = {a_pos[0] + np.linalg.norm(a_pos):.4f}')
print(f' Bad sign: v[0] = a[0] - (+norm(a)) = {a_pos[0] - np.linalg.norm(a_pos):.6f} <- cancellation!')
print('\nCase: a[0] < 0')
print(f'  Good sign: v[0] = a[0] - (+norm(a)) = {a_neg[0] - np.linalg.norm(a_neg):.4f}')
print(f'  Bad sign:  v[0] = a[0] - (-norm(a)) = {a_neg[0] + np.linalg.norm(a_neg):.6f} <- cancellation!')
# Quantify the cancellation error when the trailing component is small
# (a[0] dominates, so a[0] - norm(a) suffers catastrophic cancellation)
errors_good = []
errors_bad = []
eps_vals = np.logspace(-16, 0, 100)
for eps in eps_vals:
    a = np.array([1.0, eps])   # a[0] = 1 dominates; trailing component eps
    norm_a = np.linalg.norm(a)
    # Good: alpha = -norm (a[0] > 0), so v[0] = 1 + norm_a -- no cancellation
    v_good = a[0] + norm_a
    ref_good = a[0] + norm_a   # this form is already accurate to O(u)
    # Bad: alpha = +norm, so v[0] = 1 - norm_a -- catastrophic cancellation for small eps
    v_bad = a[0] - norm_a
    ref_bad = -eps**2 / (a[0] + norm_a)   # cancellation-free: 1 - sqrt(1+eps^2) = -eps^2/(1 + sqrt(1+eps^2))
    errors_good.append(abs(v_good - ref_good) / abs(ref_good))
    errors_bad.append(abs(v_bad - ref_bad) / max(abs(ref_bad), 1e-300))
if HAS_MPL:
fig, ax = plt.subplots(figsize=(9, 5))
ax.semilogx(eps_vals, errors_good, color=COLORS['tertiary'], label='Good sign (no cancellation)')
ax.semilogx(eps_vals, errors_bad, color=COLORS['error'], label='Bad sign (cancellation)')
    ax.set_xlabel('$a_2$ (size of trailing component)')
ax.set_ylabel('Relative error in $v_1$')
ax.set_title('Sign convention in Householder: avoiding cancellation')
ax.legend()
fig.tight_layout()
plt.show()
Code cell 14
# === 3.3 Householder QR Algorithm ===
def householder_qr(A):
"""Compute thin QR decomposition using Householder reflectors.
Returns Q (m x n) and R (n x n)."""
m, n = A.shape
A = A.copy().astype(float)
reflectors = []
for k in range(n):
# Extract column k below diagonal
a = A[k:, k]
v, beta = householder_vec(a)
reflectors.append((k, v, beta))
# Apply H_k to A[k:, k:]
A[k:, k:] = householder_apply(v, beta, A[k:, k:])
# Build Q by applying reflectors in reverse
Q = np.eye(m, n)
for k, v, beta in reversed(reflectors):
Q[k:, k:] = householder_apply(v, beta, Q[k:, k:])
R = np.triu(A[:n, :])
return Q, R
# Test on a 6x4 matrix
np.random.seed(42)
m, n = 6, 4
A_test = np.random.randn(m, n)
Q_h, R_h = householder_qr(A_test)
# Verify
err_A = np.linalg.norm(A_test - Q_h @ R_h, 'fro')
err_orth = np.linalg.norm(Q_h.T @ Q_h - np.eye(n), 'fro')
print(f'||A - QR||_F = {err_A:.2e} (expect O(u))')
print(f'||Q^T Q - I||_F = {err_orth:.2e} (expect O(u))')
ok_A = err_A < 1e-12
ok_orth = err_orth < 1e-12
print(f"{'PASS' if ok_A else 'FAIL'} — Factorization accuracy")
print(f"{'PASS' if ok_orth else 'FAIL'} — Orthogonality")
# Compare R to scipy (signs may differ)
Q_sci, R_sci = la.qr(A_test, mode='economic')
# Align signs so the two R factors are directly comparable
signs = np.sign(np.diag(R_sci)) * np.sign(np.diag(R_h))
R_h_signed = R_h * signs[:, None]
print(f'||R_householder - R_scipy||_F (after sign alignment) = {np.linalg.norm(R_h_signed - R_sci, "fro"):.2e}')
Code cell 15
# === 3.4 Classical Gram-Schmidt vs Householder Orthogonality ===
def gram_schmidt_classical(A):
"""Classical Gram-Schmidt (numerically unstable)."""
m, n = A.shape
Q = np.zeros((m, n))
R = np.zeros((n, n))
for j in range(n):
v = A[:, j].astype(float)
for i in range(j):
R[i, j] = Q[:, i] @ A[:, j]
v -= R[i, j] * Q[:, i]
R[j, j] = np.linalg.norm(v)
Q[:, j] = v / R[j, j]
return Q, R
# Build ill-conditioned matrix: use Vandermonde-like construction
# Hilbert matrix is a classic ill-conditioned case
n_test = 12
H_hilb = np.array([[1.0/(i+j+1) for j in range(n_test)] for i in range(n_test)])
kappa = np.linalg.cond(H_hilb)
print(f'Condition number of {n_test}x{n_test} Hilbert matrix: {kappa:.2e}')
Q_cgs, _ = gram_schmidt_classical(H_hilb)
Q_house, _ = householder_qr(H_hilb)
orth_cgs = np.linalg.norm(Q_cgs.T @ Q_cgs - np.eye(n_test), 'fro')
orth_house = np.linalg.norm(Q_house.T @ Q_house - np.eye(n_test), 'fro')
print(f'Classical GS orthogonality error: {orth_cgs:.2e}')
print(f'Householder orthogonality error: {orth_house:.2e}')
print(f'Ratio: {orth_cgs / orth_house:.1f}x worse for classical GS')
4. Givens Rotations
A Givens rotation zeros out a single entry by rotating in the $(i, j)$ coordinate plane. Unlike Householder (which zeros an entire column below the diagonal), Givens touches only two rows, making it ideal for sparse matrices and streaming updates.
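Concretely, for a pivot pair $(a, b)$ the next cell builds
$$G = \begin{pmatrix} c & -s \\ s & c \end{pmatrix}, \qquad c = \frac{a}{r}, \qquad s = \frac{-b}{r}, \qquad r = \pm\sqrt{a^2 + b^2},$$
so that $G\,(a, b)^\top = (r, 0)^\top$; the implementation scales by the larger of $|a|$ and $|b|$ to avoid overflow.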
Code cell 17
# === 4.1 Givens Rotation Construction and Application ===
def givens_rotation(a, b):
"""Compute c, s such that [c s; -s c] [a; b] = [r; 0].
Returns c, s, r. Numerically stable (LAPACK dlartg convention)."""
if b == 0:
c, s, r = 1.0, 0.0, a
elif abs(b) > abs(a):
t = -a / b
s = 1.0 / np.sqrt(1 + t**2)
c = s * t
r = b / s
else:
t = -b / a
c = 1.0 / np.sqrt(1 + t**2)
s = c * t
r = a / c
return c, s, r
def apply_givens_left(G_params, A, i, j):
"""Apply Givens G(i,j) on the left to A: A[[i,j], :] updated."""
c, s = G_params
A = A.copy()
row_i = c * A[i, :] - s * A[j, :]
row_j = s * A[i, :] + c * A[j, :]
A[i, :] = row_i
A[j, :] = row_j
return A
# Test: zero out entry (1,0) in a 3x3 matrix
A_g = np.array([[2., 1., 3.],
[1., 4., 2.],
[3., 2., 5.]])
# Zero out A[1,0] using G(0,1)
c, s, r = givens_rotation(A_g[0,0], A_g[1,0])
print(f'Givens: c={c:.4f}, s={s:.4f}, r={r:.4f}')
A_after = apply_givens_left((c, s), A_g, 0, 1)  # applies [[c, -s], [s, c]] to rows 0 and 1
# Verify
print(f'A[1,0] after rotation: {A_after[1,0]:.2e} (should be ~0)')
print(f'A[0,0] after rotation: {A_after[0,0]:.4f} (should be r={r:.4f})')
# Check orthogonality preserved (Frobenius norm)
G_mat = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1.]])
ok = np.allclose(G_mat @ A_g, A_after)
print(f"{'PASS' if ok else 'FAIL'} — Givens rotation applied correctly")
Code cell 18
# === 4.2 Givens QR for Dense and Banded Matrices ===
def givens_qr(A):
"""QR via Givens rotations. Returns Q (m x m) and R (m x n)."""
m, n = A.shape
A = A.copy().astype(float)
Q = np.eye(m)
for j in range(n):
for i in range(m-1, j, -1): # eliminate A[i,j]
c, s, r = givens_rotation(A[i-1, j], A[i, j])
# Update A rows i-1, i
row_prev = c * A[i-1, :] - s * A[i, :]
row_curr = s * A[i-1, :] + c * A[i, :]
A[i-1, :] = row_prev
A[i, :] = row_curr
# Update Q
col_prev = c * Q[:, i-1] - s * Q[:, i]
col_curr = s * Q[:, i-1] + c * Q[:, i]
Q[:, i-1] = col_prev
Q[:, i] = col_curr
return Q, A # A is now upper triangular = R
# Test
np.random.seed(42)
A_test = np.random.randn(5, 3)
Q_g, R_g = givens_qr(A_test)
err_A = np.linalg.norm(A_test - Q_g @ R_g, 'fro')
err_orth = np.linalg.norm(Q_g.T @ Q_g - np.eye(5), 'fro')
print(f'Givens QR: ||A - QR||_F = {err_A:.2e}')
print(f'Givens QR: ||Q^TQ - I||_F = {err_orth:.2e}')
print(f"{'PASS' if err_A < 1e-12 else 'FAIL'} — Givens QR accurate")
# Compare against numpy's QR (also Householder-based) on the same matrix
Q_h_ref, R_h_ref = np.linalg.qr(A_test)
print(f'Max |R_givens| vs |R_numpy|: {np.max(np.abs(np.abs(R_g[:3]) - np.abs(R_h_ref))):.2e}')
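One payoff of Givens rotations is cheap updating when rows arrive one at a time. Below is a minimal sketch (added for this conversion, reusing the givens_rotation helper above and assuming a square R): the stacked matrix [R; a_new] is retriangularized with n rotations in O(n^2) work instead of recomputing the full QR.
def qr_append_row(R, a_new):
    """Update the triangular factor R (n x n) when one row a_new is appended to A.
    Retriangularizes the (n+1) x n stack [R; a_new] with n Givens rotations."""
    n = R.shape[0]
    M = np.vstack([R, a_new[None, :]]).astype(float)
    for j in range(n):
        c, s, _ = givens_rotation(M[j, j], M[n, j])
        row_j = c * M[j, :] - s * M[n, :]
        row_n = s * M[j, :] + c * M[n, :]
        M[j, :], M[n, :] = row_j, row_n   # zeros M[n, j]; M[:n] stays upper triangular
    return M[:n, :]
# Check against a full re-factorization of the row-augmented matrix
a_new = np.random.randn(3)
R_updated = qr_append_row(R_g[:3, :], a_new)
R_full = np.linalg.qr(np.vstack([A_test, a_new]))[1]
print(f'Row-update QR: max abs difference vs full QR = {np.max(np.abs(np.abs(R_updated) - np.abs(R_full))):.2e}')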
5. Rank-Revealing QR (RRQR)
Column-pivoted QR reorders columns to expose rank structure: $A\Pi = QR$ with $|r_{11}| \ge |r_{22}| \ge \cdots \ge |r_{nn}|$. The diagonal of $R$ reveals the numerical rank: the index where the pivots drop below a threshold.
Code cell 20
# === 5.1 Column-Pivoted QR for Rank Estimation ===
# scipy.linalg.qr with pivoting=True
np.random.seed(42)
# Build a rank-4 matrix: A = U @ S @ V^T
m_r, n_r = 20, 8
U_r = np.linalg.qr(np.random.randn(m_r, m_r))[0][:, :m_r]
V_r = np.linalg.qr(np.random.randn(n_r, n_r))[0]
# True singular values: 100, 10, 1, 0.1, 0.001, 0.001, 0.001, 0.001
sv = np.array([100., 10., 1., 0.1, 0.001, 0.001, 0.001, 0.001])
S_r = np.zeros((m_r, n_r))
np.fill_diagonal(S_r, sv[:n_r])
A_rank4 = U_r @ S_r @ V_r.T
# Column-pivoted QR
import scipy.linalg as la
Q_rrqr, R_rrqr, P_rrqr = la.qr(A_rank4, pivoting=True)
R_diag = np.abs(np.diag(R_rrqr))
print('Diagonal of |R| from RRQR:')
for i, rd in enumerate(R_diag):
ratio = rd / R_diag[0]
print(f' |R[{i},{i}]| = {rd:.4f}, ratio = {ratio:.2e}')
# Rank estimation: where does the ratio drop?
tau = 1e-4  # threshold chosen between the 4th (1e-3) and 5th (1e-5) singular-value ratios
rank_est = np.sum(R_diag / R_diag[0] > tau)
true_sv = np.linalg.svd(A_rank4, compute_uv=False)
rank_svd = np.sum(true_sv / true_sv[0] > tau)
print(f'\nEstimated rank (RRQR, tau={tau}): {rank_est}')
print(f'True rank (SVD, tau={tau}): {rank_svd}')
Code cell 21
# === 5.2 Visualizing RRQR Rank Revelation ===
import numpy as np, scipy.linalg as la
HAS_MPL = True
try:
import matplotlib.pyplot as plt, matplotlib as mpl
except:
HAS_MPL = False
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733', 'tertiary': '#009988',
'error': '#CC3311', 'neutral': '#555555', 'highlight': '#EE3377'}
if HAS_MPL:
true_sv = np.linalg.svd(A_rank4, compute_uv=False)
R_diag2 = np.abs(np.diag(R_rrqr))
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax = axes[0]
ax.semilogy(range(1, len(true_sv)+1), true_sv, 'o-',
color=COLORS['primary'], label='True singular values')
ax.semilogy(range(1, len(R_diag2)+1), R_diag2, 's--',
color=COLORS['secondary'], label='|diag(R)| from RRQR')
ax.axhline(tau * true_sv[0], color=COLORS['neutral'], linestyle=':',
label=f'Threshold (tau={tau})')
ax.set_xlabel('Index $k$')
ax.set_ylabel('Value')
ax.set_title('Rank revelation: RRQR vs SVD')
ax.legend(fontsize=10)
ax2 = axes[1]
# Show original matrix and reordered matrix
im = ax2.imshow(np.abs(R_rrqr), cmap='viridis', aspect='auto')
plt.colorbar(im, ax=ax2, label='|R|')
ax2.set_title('|R| from column-pivoted QR')
ax2.set_xlabel('Column')
ax2.set_ylabel('Row')
fig.tight_layout()
plt.show()
print('RRQR visualization shown.')
6. Cholesky Factorization (Computational)
Brief computational recap (full theory in §07). Key points: twice as fast as LU ($n^3/3$ flops vs $2n^3/3$), works only for SPD matrices, unconditionally backward stable.
Code cell 23
# === 6.1 Cholesky Algorithm from Scratch ===
def cholesky_scratch(A):
"""Compute Cholesky L such that A = L L^T. Raises if not SPD."""
n = A.shape[0]
L = np.zeros_like(A, dtype=float)
for j in range(n):
# Diagonal entry
diag_sq = A[j, j] - L[j, :j] @ L[j, :j]
if diag_sq <= 0:
raise ValueError(f'Matrix not positive definite at step {j}, diag_sq={diag_sq}')
L[j, j] = np.sqrt(diag_sq)
# Subdiagonal entries
if j + 1 < n:
L[j+1:, j] = (A[j+1:, j] - L[j+1:, :j] @ L[j, :j]) / L[j, j]
return L
# Test on an SPD matrix
np.random.seed(42)
n = 6
X = np.random.randn(n, n+3)
A_spd = X @ X.T / (n+3) + np.eye(n) * 0.1 # guaranteed SPD
L_scratch = cholesky_scratch(A_spd)
L_scipy = np.linalg.cholesky(A_spd)
err_factor = np.linalg.norm(A_spd - L_scratch @ L_scratch.T, 'fro')
err_vs_scipy = np.linalg.norm(L_scratch - L_scipy, 'fro')
print(f'||A - LL^T||_F = {err_factor:.2e}')
print(f'||L_scratch - L_scipy||_F = {err_vs_scipy:.2e}')
print(f"{'PASS' if err_factor < 1e-12 else 'FAIL'} — Cholesky factorization")
# Log-determinant via Cholesky
log_det_chol = 2 * np.sum(np.log(np.diag(L_scratch)))
log_det_np = np.log(np.linalg.det(A_spd))
print(f'log det via Cholesky: {log_det_chol:.6f}')
print(f'log det via numpy: {log_det_np:.6f}')
print(f"{'PASS' if abs(log_det_chol - log_det_np) < 1e-10 else 'FAIL'} — Log-det accuracy")
Code cell 24
# === 6.2 Cholesky vs LU: Speed and Operation Count ===
import time
sizes_perf = [50, 100, 200, 500]
times_lu = []
times_chol = []
for n in sizes_perf:
# Build SPD matrix
X = np.random.randn(n, n+5)
A_spd = X @ X.T / (n+5) + np.eye(n)
# Time LU
t0 = time.perf_counter()
for _ in range(10):
la.lu_factor(A_spd)
t_lu = (time.perf_counter() - t0) / 10
# Time Cholesky
t0 = time.perf_counter()
for _ in range(10):
la.cholesky(A_spd)
t_chol = (time.perf_counter() - t0) / 10
times_lu.append(t_lu * 1e3)
times_chol.append(t_chol * 1e3)
speedup = t_lu / t_chol
print(f'n={n:4d}: LU={t_lu*1e3:.2f}ms, Chol={t_chol*1e3:.2f}ms, speedup={speedup:.1f}x')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(sizes_perf, times_lu, 'o-', color=COLORS['primary'], label='LU (DGETRF)')
ax.plot(sizes_perf, times_chol, 's--', color=COLORS['secondary'], label='Cholesky (DPOTRF)')
ax.set_xlabel('Matrix size $n$')
ax.set_ylabel('Time (ms)')
ax.set_title('LU vs Cholesky factorization time (SPD matrices)')
ax.legend()
fig.tight_layout()
plt.show()
7. Numerical Stability and Backward Error
The backward error framework says: forward error $\le$ condition number $\times$ backward error. Backward-stable algorithms keep the backward error at $O(u)$; the condition number then determines the forward error.
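As a small added illustration of that inequality, the following snippet computes the normwise relative backward error $\eta = \lVert b - A\hat{x}\rVert / (\lVert A\rVert\,\lVert\hat{x}\rVert + \lVert b\rVert)$ for an LU solve and compares the forward error against $\kappa(A)\,\eta$.
import numpy as np, scipy.linalg as la
np.random.seed(42)
n = 40
A = np.random.randn(n, n)
x_true = np.random.randn(n)
b = A @ x_true
x_hat = la.lu_solve(la.lu_factor(A), b)
# Normwise relative backward error (Rigal-Gaches formula)
eta = np.linalg.norm(b - A @ x_hat) / (np.linalg.norm(A, 2) * np.linalg.norm(x_hat) + np.linalg.norm(b))
fwd_err = np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true)
kappa = np.linalg.cond(A, 2)
print(f'backward error eta = {eta:.2e}  (O(u) for a backward-stable solver)')
print(f'forward error      = {fwd_err:.2e}')
print(f'kappa(A) * eta     = {kappa * eta:.2e}  (bound, up to a modest constant)')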
Code cell 26
# === 7.1 Stability Comparison: LU vs QR for Ill-Conditioned Systems ===
import numpy as np, scipy.linalg as la
def build_ill_conditioned(n, kappa_target):
"""Build a matrix with condition number approximately kappa_target."""
U, _ = np.linalg.qr(np.random.randn(n, n))
V, _ = np.linalg.qr(np.random.randn(n, n))
sv = np.logspace(0, -np.log10(kappa_target), n)
return U @ np.diag(sv) @ V.T
np.random.seed(42)
results = []
for kappa in [1e2, 1e6, 1e10, 1e13]:
A = build_ill_conditioned(20, kappa)
x_true = np.ones(20) # known true solution
b = A @ x_true
# LU solve
x_lu = np.linalg.solve(A, b) # uses LU internally
err_lu = np.linalg.norm(x_lu - x_true) / np.linalg.norm(x_true)
# QR solve (via lstsq — Householder QR internally)
x_qr, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
err_qr = np.linalg.norm(x_qr - x_true) / np.linalg.norm(x_true)
kappa_actual = np.linalg.cond(A)
print(f'kappa={kappa:.0e}: err_LU={err_lu:.2e}, err_QR={err_qr:.2e}, kappa_actual={kappa_actual:.2e}')
results.append((kappa_actual, err_lu, err_qr))
print('\nNote: Both LU and QR have similar forward errors for square systems.')
print('QR advantage is for LEAST SQUARES (avoids kappa^2 from normal equations).')
Code cell 27
# === 7.2 Normal Equations vs QR for Least Squares ===
np.random.seed(42)
errors_ne = []
errors_qr_ls = []
kappas = []
for kappa in np.logspace(1, 14, 30):
m, n = 40, 20
U, _ = np.linalg.qr(np.random.randn(m, n))
V, _ = np.linalg.qr(np.random.randn(n, n))
sv = np.logspace(0, -np.log10(kappa), n)
A = U @ np.diag(sv) @ V.T
x_true = np.random.randn(n)
b = A @ x_true + 1e-10 * np.random.randn(m) # tiny noise
# Normal equations: (A^T A) x = A^T b
try:
ATA = A.T @ A
ATb = A.T @ b
L_ne = la.cholesky(ATA, lower=True)
y = la.solve_triangular(L_ne, ATb, lower=True)
x_ne = la.solve_triangular(L_ne.T, y)
err_ne = np.linalg.norm(x_ne - x_true) / (np.linalg.norm(x_true) + 1e-14)
except Exception:
err_ne = 1.0
# QR via lstsq
x_qr_sol, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
err_qr_sol = np.linalg.norm(x_qr_sol - x_true) / (np.linalg.norm(x_true) + 1e-14)
kappa_act = np.linalg.cond(A)
kappas.append(kappa_act)
errors_ne.append(err_ne)
errors_qr_ls.append(err_qr_sol)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(9, 5))
ax.loglog(kappas, errors_ne, '.', color=COLORS['error'], alpha=0.7,
label='Normal equations (Cholesky of $A^TA$)')
ax.loglog(kappas, errors_qr_ls, 's', color=COLORS['tertiary'], alpha=0.7, markersize=4,
label='QR (Householder lstsq)')
ax.loglog(kappas, [k * 2e-16 for k in kappas], '--', color=COLORS['neutral'],
label=r'$\kappa \cdot u$ reference')
ax.loglog(kappas, [k**2 * 2e-16 for k in kappas], ':', color=COLORS['neutral'],
label=r'$\kappa^2 \cdot u$ reference')
ax.set_xlabel('Condition number $\\kappa(A)$')
ax.set_ylabel('Relative solution error')
ax.set_title('Normal equations vs QR for least squares')
ax.legend(fontsize=10)
ax.set_ylim(1e-18, 10)
fig.tight_layout()
plt.show()
print('Plot: QR tracks kappa*u; normal equations track kappa^2*u.')
Code cell 28
# === 7.3 Iterative Refinement in Mixed Precision ===
np.random.seed(42)
n = 30
# Build moderately ill-conditioned system
A_ir = build_ill_conditioned(n, 1e8)
x_true_ir = np.ones(n)
b_ir = A_ir @ x_true_ir
# Low-precision factorization: round A to FP32, then factor the rounded copy in FP64
# (the FP32 rounding error stands in for a genuinely low-precision factorization)
A32 = A_ir.astype(np.float32).astype(np.float64)
lu32, piv32 = la.lu_factor(A32)
# Initial solve
x_curr = la.lu_solve((lu32, piv32), b_ir)
print('Iterative Refinement:')
for k in range(5):
r = b_ir - A_ir @ x_curr # FP64 residual
r_norm = np.linalg.norm(r) / np.linalg.norm(b_ir)
err_x = np.linalg.norm(x_curr - x_true_ir) / np.linalg.norm(x_true_ir)
print(f' Step {k}: ||r||/||b|| = {r_norm:.2e}, ||x-x*||/||x*|| = {err_x:.2e}')
if r_norm < 1e-14:
break
delta = la.lu_solve((lu32, piv32), r)
x_curr = x_curr + delta
print(f'Final error: {np.linalg.norm(x_curr - x_true_ir)/np.linalg.norm(x_true_ir):.2e}')
8. Blocked Algorithms and LAPACK
Blocked algorithms reformulate factorizations to use BLAS-3 (matrix-matrix) operations instead of BLAS-2 (matrix-vector), achieving $O(n)$ arithmetic intensity (flops per word moved) and a 5-10x speedup on modern CPUs/GPUs.
Code cell 30
# === 8.1 BLAS-2 vs BLAS-3 Arithmetic Intensity ===
import time
import numpy as np
# BLAS-2: matrix-vector multiply (reads n^2 words, does 2n^2 flops -> intensity O(1))
# BLAS-3: matrix-matrix multiply (reads 2n^2 words, does 2n^3 flops -> intensity O(n))
n = 500
A = np.random.randn(n, n)
x = np.random.randn(n)
B = np.random.randn(n, n)
nrep = 20
# BLAS-2: DGEMV
t0 = time.perf_counter()
for _ in range(nrep):
y = A @ x
t_gemv = (time.perf_counter() - t0) / nrep
# BLAS-3: DGEMM
t0 = time.perf_counter()
for _ in range(nrep):
C = A @ B
t_gemm = (time.perf_counter() - t0) / nrep
flops_gemv = 2 * n**2
flops_gemm = 2 * n**3
gflops_gemv = flops_gemv / t_gemv / 1e9
gflops_gemm = flops_gemm / t_gemm / 1e9
print(f'n = {n}')
print(f'DGEMV (BLAS-2): {t_gemv*1e3:.2f} ms, {gflops_gemv:.1f} GFLOP/s')
print(f'DGEMM (BLAS-3): {t_gemm*1e3:.2f} ms, {gflops_gemm:.1f} GFLOP/s')
print(f'BLAS-3 / BLAS-2 throughput ratio: {gflops_gemm / gflops_gemv:.1f}x')
print('\nThis ratio grows with n — blocked algorithms exploit BLAS-3.')
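To connect this to factorizations, here is a minimal sketch (added for this conversion; block size 64 assumed) of a right-looking blocked Cholesky in the style of LAPACK's DPOTRF: a small unblocked factorization of the diagonal block, a triangular solve for the panel, and a BLAS-3 update of the trailing submatrix.
import numpy as np
import scipy.linalg as la
def blocked_cholesky(A, block=64):
    """Right-looking blocked Cholesky: most flops land in the BLAS-3 trailing update."""
    A = A.copy().astype(float)
    n = A.shape[0]
    for k in range(0, n, block):
        kb = min(block, n - k)
        # 1. Factor the kb x kb diagonal block (small, unblocked)
        A[k:k+kb, k:k+kb] = np.linalg.cholesky(A[k:k+kb, k:k+kb])
        L_kk = A[k:k+kb, k:k+kb]
        if k + kb < n:
            # 2. Panel: L_21 = A_21 L_11^{-T}  (triangular solve, BLAS-3 TRSM)
            A[k+kb:, k:k+kb] = la.solve_triangular(L_kk, A[k+kb:, k:k+kb].T, lower=True).T
            L_21 = A[k+kb:, k:k+kb]
            # 3. Trailing update: A_22 <- A_22 - L_21 L_21^T  (BLAS-3 SYRK/GEMM)
            A[k+kb:, k+kb:] -= L_21 @ L_21.T
    return np.tril(A)
X_blk = np.random.randn(400, 420)
A_blk = X_blk @ X_blk.T / 420 + np.eye(400)
L_blk = blocked_cholesky(A_blk)
print(f'Blocked Cholesky: ||A - LL^T||_F = {np.linalg.norm(A_blk - L_blk @ L_blk.T, "fro"):.2e}')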
Code cell 31
# === 8.2 LAPACK Routine Reference and Timing ===
import scipy.linalg as la
import time
np.random.seed(42)
n = 300
X = np.random.randn(n, n + 50)
A_spd = X @ X.T / (n + 50) + np.eye(n)
A_gen = np.random.randn(n, n)
b_vec = np.random.randn(n)
timings = {}
# DGETRF: LU with partial pivoting
t0 = time.perf_counter()
for _ in range(5):
lu, piv = la.lu_factor(A_gen)
timings['DGETRF (LU)'] = (time.perf_counter() - t0) / 5
# DGEQRF: Householder QR
t0 = time.perf_counter()
for _ in range(5):
Q_t, R_t = la.qr(A_gen)
timings['DGEQRF (QR)'] = (time.perf_counter() - t0) / 5
# DPOTRF: Cholesky
t0 = time.perf_counter()
for _ in range(5):
L_t = la.cholesky(A_spd)
timings['DPOTRF (Chol)'] = (time.perf_counter() - t0) / 5
# DGETRS: triangular solve after LU
t0 = time.perf_counter()
for _ in range(20):
x_s = la.lu_solve((lu, piv), b_vec)
timings['DGETRS (solve)'] = (time.perf_counter() - t0) / 20
print(f'Timings for n={n}:')
for name, t in timings.items():
print(f' {name}: {t*1e3:.2f} ms')
# Theoretical flop counts
print(f'\nTheoretical flop counts:')
print(f' LU (2/3 n^3): {2/3 * n**3 / 1e6:.1f} Mflops')
print(f' QR (4/3 n^3): {4/3 * n**3 / 1e6:.1f} Mflops')
print(f' Chol (1/3 n^3): {1/3 * n**3 / 1e6:.1f} Mflops')
9. Gaussian Process Inference via Cholesky
GP regression requires solving $(K + \sigma_n^2 I)\,\alpha = y$ and computing $\log\det(K + \sigma_n^2 I)$, both via Cholesky. This is the bottleneck in Bayesian optimization and uncertainty quantification.
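For reference, the quantities computed in the next cell are
$$\alpha = (K + \sigma_n^2 I)^{-1} y, \qquad \mu_* = K_{*n}\,\alpha, \qquad \operatorname{var}_* = \operatorname{diag}\!\bigl(K_{**} - K_{*n}(K + \sigma_n^2 I)^{-1} K_{n*}\bigr),$$
$$\log p(y \mid X) = -\tfrac{1}{2}\, y^\top \alpha - \sum_i \log L_{ii} - \tfrac{n}{2}\log 2\pi,$$
where $L$ is the Cholesky factor of $K + \sigma_n^2 I$: the $O(n^3/3)$ factorization is done once and every solve afterwards is two $O(n^2)$ triangular substitutions.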
Code cell 33
# === 9.1 GP Regression Implementation ===
import numpy as np, scipy.linalg as la
def rbf_kernel(X1, X2, ell=1.0, sf=1.0):
"""RBF/squared-exponential kernel: k(x,x') = sf^2 * exp(-||x-x'||^2 / (2*ell^2))."""
X1 = np.atleast_2d(X1)
X2 = np.atleast_2d(X2)
diff = X1[:, None, :] - X2[None, :, :] # (n1, n2, d)
sq_dist = np.sum(diff**2, axis=-1) # (n1, n2)
return sf**2 * np.exp(-sq_dist / (2 * ell**2))
def gp_predict(X_train, y_train, X_test, ell=1.0, sf=1.0, sigma_n=0.1):
"""GP posterior prediction using Cholesky for all linear solves."""
n = len(X_train)
K_nn = rbf_kernel(X_train, X_train, ell, sf)
K_ss = rbf_kernel(X_test, X_test, ell, sf)
K_sn = rbf_kernel(X_test, X_train, ell, sf)
# Noisy kernel: K + sigma_n^2 * I
K_noisy = K_nn + sigma_n**2 * np.eye(n)
# Cholesky factorization
L = la.cholesky(K_noisy, lower=True)
# Solve for alpha = (K + sigma^2 I)^{-1} y via forward+backward sub
v = la.solve_triangular(L, y_train, lower=True) # L v = y
alpha = la.solve_triangular(L.T, v, lower=False) # L^T alpha = v
# Posterior mean: mu_* = K_*n @ alpha
mu_star = K_sn @ alpha
# Posterior variance: Sigma_* = K_** - K_*n (K + sigma^2 I)^{-1} K_n*
V = la.solve_triangular(L, K_sn.T, lower=True) # L V = K_n*
var_star = np.diag(K_ss) - np.sum(V**2, axis=0)
# Log marginal likelihood
log_lik = -0.5 * (y_train @ alpha) - np.sum(np.log(np.diag(L))) - 0.5 * n * np.log(2*np.pi)
return mu_star, np.maximum(var_star, 0), log_lik
# Generate training data from f(x) = sin(3x)
np.random.seed(42)
n_train = 40
X_train = np.linspace(0, 2*np.pi, n_train).reshape(-1, 1)
f_true = np.sin(3 * X_train.ravel())
y_train = f_true + 0.1 * np.random.randn(n_train)
X_test = np.linspace(-0.2, 2*np.pi + 0.2, 200).reshape(-1, 1)
mu_star, var_star, log_lik = gp_predict(X_train, y_train, X_test, ell=0.6, sf=1.0, sigma_n=0.1)
print(f'GP prediction computed. Log marginal likelihood: {log_lik:.3f}')
print(f'Posterior mean range: [{mu_star.min():.2f}, {mu_star.max():.2f}]')
print(f'Posterior std range: [{np.sqrt(var_star).min():.4f}, {np.sqrt(var_star).max():.4f}]')
Code cell 34
# === 9.2 GP Posterior Visualization ===
if HAS_MPL:
COLORS = {'primary': '#0077BB', 'secondary': '#EE7733', 'tertiary': '#009988',
'error': '#CC3311', 'neutral': '#555555', 'highlight': '#EE3377'}
import matplotlib.pyplot as plt
std_star = np.sqrt(var_star)
x_plot = X_test.ravel()
fig, ax = plt.subplots(figsize=(10, 5))
ax.fill_between(x_plot, mu_star - 2*std_star, mu_star + 2*std_star,
alpha=0.2, color=COLORS['primary'], label='95% credible interval')
ax.plot(x_plot, mu_star, color=COLORS['primary'], label='Posterior mean')
ax.plot(x_plot, np.sin(3*x_plot), '--', color=COLORS['neutral'],
label='True function $\\sin(3x)$')
ax.scatter(X_train.ravel(), y_train, color=COLORS['secondary'],
s=30, zorder=5, label='Training data')
ax.set_xlabel('$x$')
ax.set_ylabel('$f(x)$')
ax.set_title('GP regression via Cholesky factorization')
ax.legend(fontsize=10)
fig.tight_layout()
plt.show()
print('GP posterior plot shown.')
Code cell 35
# === 9.3 GP Hyperparameter Optimization via Log Marginal Likelihood ===
# Grid search over (ell, sigma_n)
ell_vals = np.logspace(-1, 1, 20)
sig_vals = np.logspace(-2, 0, 20)
log_liks = np.zeros((len(ell_vals), len(sig_vals)))
for i, ell in enumerate(ell_vals):
for j, sig in enumerate(sig_vals):
try:
_, _, ll = gp_predict(X_train, y_train, X_train, ell=ell, sf=1.0, sigma_n=sig)
log_liks[i, j] = ll
except Exception:
log_liks[i, j] = -np.inf
best_idx = np.unravel_index(np.argmax(log_liks), log_liks.shape)
best_ell = ell_vals[best_idx[0]]
best_sig = sig_vals[best_idx[1]]
print(f'Best hyperparameters: ell={best_ell:.3f}, sigma_n={best_sig:.3f}')
print(f'Best log marginal likelihood: {log_liks[best_idx]:.3f}')
if HAS_MPL:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.contourf(np.log10(sig_vals), np.log10(ell_vals), log_liks,
levels=30, cmap='viridis')
plt.colorbar(im, ax=ax, label='Log marginal likelihood')
ax.scatter(np.log10(best_sig), np.log10(best_ell), color='red',
marker='*', s=200, zorder=5, label='Best')
ax.set_xlabel('$\\log_{10}(\\sigma_n)$')
ax.set_ylabel('$\\log_{10}(\\ell)$')
ax.set_title('GP log marginal likelihood as function of hyperparameters')
ax.legend()
fig.tight_layout()
plt.show()
10. Differentiating Through Factorizations
Modern ML requires gradients through linear solves. Implicit differentiation through $Ax = b$ avoids unrolling the $O(n^3)$ solver operations and reuses the already-computed factorization.
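Written out, the two identities verified in the next cells are
$$\frac{\partial}{\partial A}\log\det A = A^{-\top}\;(= A^{-1}\ \text{for SPD } A), \qquad x = A^{-1} b,\ \ \ell = f(x)\ \Rightarrow\ \lambda = A^{-\top}\frac{\partial \ell}{\partial x},\quad \frac{\partial \ell}{\partial A} = -\lambda x^\top,\quad \frac{\partial \ell}{\partial b} = \lambda.$$
Both reuse the factorization from the forward solve, so the backward pass costs only additional triangular solves.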
Code cell 37
# === 10.1 Gradient of log det A via Cholesky ===
import numpy as np, scipy.linalg as la
def log_det_spd(A):
"""log det A for SPD A, computed stably via Cholesky."""
L = la.cholesky(A, lower=True)
return 2 * np.sum(np.log(np.diag(L)))
def grad_log_det_cholesky(A):
"""Gradient of log det A = A^{-1}, computed via Cholesky."""
L = la.cholesky(A, lower=True)
L_inv = la.solve_triangular(L, np.eye(A.shape[0]), lower=True)
return L_inv.T @ L_inv # = A^{-1}
np.random.seed(42)
n = 8
X = np.random.randn(n, n + 5)
A_test = X @ X.T / (n + 5) + np.eye(n) * 0.5
# Analytical gradient
grad_ana = grad_log_det_cholesky(A_test)
# Finite difference verification
eps = 1e-5
grad_fd = np.zeros((n, n))
for i in range(n):
for j in range(n):
A_p = A_test.copy(); A_p[i,j] += eps
A_m = A_test.copy(); A_m[i,j] -= eps
grad_fd[i, j] = (log_det_spd(A_p) - log_det_spd(A_m)) / (2 * eps)
# For symmetric A, the gradient should be symmetrized
grad_fd_sym = (grad_fd + grad_fd.T) / 2
err_grad = np.linalg.norm(grad_ana - grad_fd_sym, 'fro') / np.linalg.norm(grad_ana, 'fro')
print(f'||grad_chol - grad_FD||_F / ||grad_chol||_F = {err_grad:.2e}')
print(f"{'PASS' if err_grad < 1e-4 else 'FAIL'} — Gradient of log det via Cholesky")
# Verify: grad = A^{-1}
A_inv = np.linalg.inv(A_test)
err_inv = np.linalg.norm(grad_ana - A_inv, 'fro') / np.linalg.norm(A_inv, 'fro')
print(f'||grad - A^{{-1}}||_F / ||A^{{-1}}||_F = {err_inv:.2e}')
print(f"{'PASS' if err_inv < 1e-10 else 'FAIL'} — grad log det A = A^{{-1}}")
Code cell 38
# === 10.2 Implicit Differentiation Through Solve ===
# If x = A^{-1} b and loss = f(x), then df/dA = -A^{-T} (df/dx) x^T
# Equivalently: adjoint lambda = A^{-T} (df/dx), then df/dA = -lambda @ x^T
np.random.seed(42)
n = 10
A = np.random.randn(n, n) + n * np.eye(n) # diag-dominant for stability
b = np.random.randn(n)
# Forward: solve A x = b
x_sol = np.linalg.solve(A, b)
# Loss: f(x) = ||x||^2
f = np.sum(x_sol**2)
df_dx = 2 * x_sol
# Adjoint: lambda = A^{-T} df_dx
lam = np.linalg.solve(A.T, df_dx)
# Gradient: df/dA = -lambda @ x^T (for square A)
grad_A_ana = -np.outer(lam, x_sol)
# Finite difference verification
eps = 1e-5
grad_A_fd = np.zeros((n, n))
for i in range(n):
for j in range(n):
A_p = A.copy(); A_p[i,j] += eps
A_m = A.copy(); A_m[i,j] -= eps
x_p = np.linalg.solve(A_p, b)
x_m = np.linalg.solve(A_m, b)
grad_A_fd[i,j] = (np.sum(x_p**2) - np.sum(x_m**2)) / (2 * eps)
err = np.linalg.norm(grad_A_ana - grad_A_fd) / np.linalg.norm(grad_A_fd)
print(f'Implicit diff gradient error: {err:.2e}')
print(f"{'PASS' if err < 1e-3 else 'FAIL'} — Implicit differentiation through solve")
print('Cost: 2 triangular solves (forward + adjoint) — O(n^2) with cached factorization')
11. Randomized Factorizations
For matrices where only a rank-$r$ approximation is needed, randomized algorithms cost $O(mnr)$ versus $O(mn^2)$ for exact QR. They underlie LoRA, GaLore, and sketch-and-solve methods.
Code cell 40
# === 11.1 Randomized QR/SVD (Halko-Martinsson-Tropp) ===
import numpy as np, scipy.linalg as la
def randomized_svd(A, r, n_oversampling=10, n_power_iter=2):
"""Compute rank-r SVD approximation via randomized sketch.
Halko, Martinsson, Tropp (2011) Algorithm 4.4 with power iteration."""
m, n = A.shape
k = r + n_oversampling
# Step 1: Random sketch
Omega = np.random.randn(n, k)
Y = A @ Omega
# Step 2: Power iteration for better approximation
for _ in range(n_power_iter):
Y = A @ (A.T @ Y)
# Step 3: Orthonormalize sketch
Q, _ = np.linalg.qr(Y)
Q = Q[:, :k]
# Step 4: Project and factor
B = Q.T @ A # (k x n) — small matrix
Uhat, S, Vt = np.linalg.svd(B, full_matrices=False)
# Step 5: Lift back
U = Q @ Uhat
return U[:, :r], S[:r], Vt[:r, :]
# Build a low-rank + noise matrix
np.random.seed(42)
m, n = 500, 300
r_true = 10
U_true = np.linalg.qr(np.random.randn(m, r_true))[0]
V_true = np.linalg.qr(np.random.randn(n, r_true))[0]
sv_true = np.exp(-np.arange(r_true) * 0.5)
A_lr = U_true @ np.diag(sv_true) @ V_true.T + 1e-3 * np.random.randn(m, n)
# Exact SVD
sv_exact = np.linalg.svd(A_lr, compute_uv=False)
# Randomized SVD
r_target = 10
U_r, S_r, Vt_r = randomized_svd(A_lr, r_target)
# Approximation error
A_approx = U_r @ np.diag(S_r) @ Vt_r
err_rand = np.linalg.norm(A_lr - A_approx, 'fro') / np.linalg.norm(A_lr, 'fro')
# Optimal error (Eckart-Young)
err_opt = np.sqrt(np.sum(sv_exact[r_target:]**2)) / np.linalg.norm(A_lr, 'fro')
print(f'Rank-{r_target} approximation:')
print(f' Randomized SVD error: {err_rand:.4f}')
print(f' Optimal error (Eckart-Young): {err_opt:.4f}')
print(f' Ratio: {err_rand/err_opt:.2f}x optimal')
print(f"{'PASS' if err_rand < 2 * err_opt else 'FAIL'} — Randomized SVD near-optimal")
Code cell 41
# === 11.2 LoRA-Inspired Initialization via Randomized QR ===
# LoRA: Delta W = B A, where B in R^{d x r}, A in R^{r x k}
# Key question: how to initialize B and A for good coverage of the weight matrix?
np.random.seed(42)
# Simulate a pretrained weight matrix
d, k = 256, 128
W = np.random.randn(d, k) / np.sqrt(k) # Xavier-like initialization
# Option 1: Standard LoRA init (B=0, A random)
r = 8
B_zero = np.zeros((d, r))
A_rand = np.random.randn(r, k) / np.sqrt(r)
Delta_W_lora = B_zero @ A_rand
# Option 2: SVD-based initialization (covers principal directions of W)
U_svd, S_svd, Vt_svd = randomized_svd(W, r)
B_svd = U_svd @ np.diag(np.sqrt(S_svd))
A_svd = np.diag(np.sqrt(S_svd)) @ Vt_svd
Delta_W_svd = B_svd @ A_svd
# How well does each capture the weight matrix?
err_lora_init = np.linalg.norm(W - Delta_W_lora, 'fro') / np.linalg.norm(W, 'fro')
err_svd_init = np.linalg.norm(W - Delta_W_svd, 'fro') / np.linalg.norm(W, 'fro')
print(f'LoRA init (B=0, A random): relative error = {err_lora_init:.4f}')
print(f'SVD init: relative error = {err_svd_init:.4f}')
print(f'Eckart-Young bound for rank-{r}: {np.sqrt(sum(np.linalg.svd(W, compute_uv=False)[r:]**2)) / np.linalg.norm(W):.4f}')
print(f'\nSVD initialization captures the low-rank structure of W at rank {r}.')
print('Standard LoRA starts from Delta W = 0 to preserve pretrained behavior.')
12. Machine Learning Applications
Factorizations appear as load-bearing mathematics in:
- Least squares (QR for linear probing, ridge regression)
- Second-order optimization (LU/Cholesky for Newton, K-FAC)
- GP inference (Cholesky — already shown)
- Gradient compression (randomized SVD/QR — GaLore, LoRA)
Code cell 43
# === 12.1 K-FAC: Kronecker-Factored Cholesky for Natural Gradient ===
import numpy as np, scipy.linalg as la
# K-FAC approximates the Fisher information matrix as:
# F_l ≈ A_l ⊗ G_l (Kronecker product of activation and gradient covariances)
# Inverse: (A_l ⊗ G_l)^{-1} = A_l^{-1} ⊗ G_l^{-1}
# Both A_l^{-1} and G_l^{-1} computed via Cholesky
def kfac_step(A, G, gradient, damping=1e-3):
"""Compute K-FAC natural gradient step.
A: activation covariance (d_in x d_in)
G: gradient covariance (d_out x d_out)
gradient: weight gradient (d_out x d_in)
Returns natural gradient as (d_out x d_in) matrix.
"""
d_in = A.shape[0]
d_out = G.shape[0]
# Add damping for numerical stability
A_reg = A + damping * np.eye(d_in)
G_reg = G + damping * np.eye(d_out)
# Invert via Cholesky
L_A = la.cholesky(A_reg, lower=True)
L_G = la.cholesky(G_reg, lower=True)
A_inv = la.cho_solve((L_A, True), np.eye(d_in))
G_inv = la.cho_solve((L_G, True), np.eye(d_out))
# Natural gradient: F^{-1} g = (A^{-1} ⊗ G^{-1}) vec(G) = G_inv @ gradient @ A_inv
nat_grad = G_inv @ gradient @ A_inv
return nat_grad
# Simulate one K-FAC step
np.random.seed(42)
d_in, d_out = 64, 32
batch_size = 128
# Random activations and pre-activation gradients
acts = np.random.randn(batch_size, d_in) # activations
pre_grads = np.random.randn(batch_size, d_out) # pre-activation gradients
# Estimate factor covariances
A_cov = (acts.T @ acts) / batch_size # d_in x d_in
G_cov = (pre_grads.T @ pre_grads) / batch_size # d_out x d_out
# Weight gradient
W_grad = (pre_grads.T @ acts) / batch_size # d_out x d_in
nat_gradient = kfac_step(A_cov, G_cov, W_grad)
print(f'K-FAC natural gradient shape: {nat_gradient.shape}')
print(f'Gradient norm: {np.linalg.norm(W_grad):.4f}')
print(f'Natural gradient norm: {np.linalg.norm(nat_gradient):.4f}')
# Cholesky cost: two n^3/3 factorizations
flops_chol_A = d_in**3 / 3
flops_chol_G = d_out**3 / 3
flops_mat_mul = d_out**2 * d_in + d_out * d_in**2 # G_inv @ grad @ A_inv
print(f'\nK-FAC step flops: {flops_chol_A + flops_chol_G + flops_mat_mul:.0f}')
print(f'Full Fisher inverse would cost: {(d_in*d_out)**3:.2e} flops (infeasible)')
Code cell 44
# === 12.2 Linear Probing via QR Least Squares ===
import numpy as np, scipy.linalg as la
# Simulate: extract features from a pretrained model, fit linear head
np.random.seed(42)
n_samples = 1000 # training samples
d_feat = 256 # feature dimension
n_classes = 10 # number of classes
# Synthetic features (roughly like neural net embeddings)
X_feat = np.random.randn(n_samples, d_feat) / np.sqrt(d_feat)
# True weights (one-hot labels encoded continuously)
W_true = np.random.randn(d_feat, n_classes) / np.sqrt(d_feat)
Y = X_feat @ W_true + 0.01 * np.random.randn(n_samples, n_classes)
# Method 1: Normal equations (A^T A W = A^T Y)
XtX = X_feat.T @ X_feat
XtY = X_feat.T @ Y
W_ne = np.linalg.solve(XtX, XtY) # solves (X^T X) W = X^T Y
# Method 2: QR-based least squares (more stable)
W_qr, _, _, _ = np.linalg.lstsq(X_feat, Y, rcond=None)
# Method 3: Ridge regression (regularized)
lam = 1e-4
W_ridge = np.linalg.solve(XtX + lam * np.eye(d_feat), XtY)
# Training residuals
res_ne = np.linalg.norm(X_feat @ W_ne - Y, 'fro') / np.linalg.norm(Y, 'fro')
res_qr = np.linalg.norm(X_feat @ W_qr - Y, 'fro') / np.linalg.norm(Y, 'fro')
res_ridge = np.linalg.norm(X_feat @ W_ridge - Y, 'fro') / np.linalg.norm(Y, 'fro')
err_ne = np.linalg.norm(W_ne - W_true, 'fro') / np.linalg.norm(W_true, 'fro')
err_qr = np.linalg.norm(W_qr - W_true, 'fro') / np.linalg.norm(W_true, 'fro')
err_ridge = np.linalg.norm(W_ridge - W_true, 'fro') / np.linalg.norm(W_true, 'fro')
print('Linear probing results:')
print(f' Normal equations: residual={res_ne:.4f}, solution error={err_ne:.4f}')
print(f' QR (lstsq): residual={res_qr:.4f}, solution error={err_qr:.4f}')
print(f' Ridge (lambda={lam}): residual={res_ridge:.4f}, solution error={err_ridge:.4f}')
print(f'Condition number of X: {np.linalg.cond(X_feat):.2e}')
13. Summary: When to Use Which Factorization
The choice of factorization depends on matrix structure, problem type, and stability requirements.
Code cell 46
# === 13.1 Decision Table with Numerical Verification ===
import numpy as np, scipy.linalg as la
np.random.seed(42)
def make_spd(n, kappa=10):
U, _ = np.linalg.qr(np.random.randn(n, n))
sv = np.logspace(0, -np.log10(kappa), n)
return U @ np.diag(sv) @ U.T
def make_general(n):
return np.random.randn(n, n) + n * np.eye(n)
def make_overdetermined(m, n):
return np.random.randn(m, n)
n = 50
A_spd = make_spd(n)
A_gen = make_general(n)
A_over = make_overdetermined(2*n, n)
b = np.random.randn(n)
b_long = np.random.randn(2*n)
x_true = np.ones(n)
print('=== Solving Ax = b ===')
# SPD: use Cholesky
b_spd = A_spd @ x_true
L_c = la.cholesky(A_spd, lower=True)
x_chol = la.cho_solve((L_c, True), b_spd)
print(f'Cholesky (SPD): error = {np.linalg.norm(x_chol - x_true)/np.linalg.norm(x_true):.2e}')
# General: use LU
b_gen = A_gen @ x_true
lu_f, piv_f = la.lu_factor(A_gen)
x_lu = la.lu_solve((lu_f, piv_f), b_gen)
print(f'LU partial pivot (general): error = {np.linalg.norm(x_lu - x_true)/np.linalg.norm(x_true):.2e}')
print('\n=== Solving Least Squares min ||Ax-b|| ===')
# Overdetermined: use QR
x_true_ls = np.ones(n)
b_ls = A_over @ x_true_ls + 0.01 * np.random.randn(2*n)
x_qr_ls, _, _, _ = np.linalg.lstsq(A_over, b_ls, rcond=None)
print(f'QR lstsq: error = {np.linalg.norm(x_qr_ls - x_true_ls)/np.linalg.norm(x_true_ls):.2e}')
print('\n=== Summary Table ===')
print('Matrix type | Algorithm | Cost | Stability')
print('-' * 65)
print('SPD | Cholesky (dpotrf) | n^3/3 | Unconditional')
print('Sym. indefinite | LDL^T (dsytrf) | n^3/3 | Bunch-Kaufman')
print('General square | LU + partial pivot | 2n^3/3 | Practical')
print('Overdetermined | Householder QR | 2mn^2 | Unconditional')
print('Rank-deficient | RRQR or SVD | 2mn^2 | Unconditional')
Appendix: Condition Number Estimation and Sparse Factorizations
Condition estimation and fill-in reduction for sparse systems.
Code cell 48
# === A.1 Condition Number Estimation ===
import numpy as np, scipy.linalg as la
# LAPACK's DGECON estimates the condition number cheaply by reusing an existing LU
# factorization; below we mimic the idea with a few power-iteration steps on A^{-T} A^{-1}.
np.random.seed(42)
sizes_cond = [10, 50, 100, 200]
print('Condition number estimation:')
print(f'{"n":>5} | {"True kappa":>12} | {"Estimated (DGECON-like)":>22} | {"Ratio":>8}')
print('-' * 55)
for n in sizes_cond:
# Build matrix with known condition number
U, _ = np.linalg.qr(np.random.randn(n, n))
V, _ = np.linalg.qr(np.random.randn(n, n))
kappa_target = 10**np.random.uniform(2, 8)
sv = np.logspace(0, -np.log10(kappa_target), n)
A = U @ np.diag(sv) @ V.T
    kappa_true = np.linalg.cond(A, 2)  # exact, uses SVD
    # Cheap estimate: reuse the LU factorization and run power iteration on
    # A^{-T} A^{-1} to estimate ||A^{-1}||_2 (same spirit as DGECON's norm estimator)
    lu_piv = la.lu_factor(A)
    z = np.random.randn(n)
    z /= np.linalg.norm(z)
    est = 1.0
    for _ in range(20):
        w = la.lu_solve(lu_piv, z)            # w = A^{-1} z
        z = la.lu_solve(lu_piv, w, trans=1)   # z = A^{-T} w
        est = np.linalg.norm(z)               # converges to sigma_max(A^{-1})^2
        z /= est
    kappa_est = np.sqrt(est) * np.linalg.norm(A, 2)
    print(f'{n:>5} | {kappa_true:>12.2e} | {kappa_est:>22.2e} | {kappa_est/kappa_true:>8.2f}')
print('\nRule of thumb: if kappa = 10^k, expect to lose about k of the ~16 significant decimal digits in FP64.')
Code cell 49
# === A.2 Sparse Fill-In Demonstration ===
import numpy as np
# Sparse tridiagonal matrix: bandwidth 1
n = 10
A_tridiag = np.diag(4 * np.ones(n)) + np.diag(-1 * np.ones(n-1), 1) + np.diag(-1 * np.ones(n-1), -1)
# LU factorization
import scipy.linalg as la
P, L, U = la.lu(A_tridiag)
# Count nonzeros
tol = 1e-10
nnz_A = np.sum(np.abs(A_tridiag) > tol)
nnz_L = np.sum(np.abs(L) > tol)
nnz_U = np.sum(np.abs(U) > tol)
print(f'Tridiagonal matrix ({n}x{n}):')
print(f' Nonzeros in A: {nnz_A} (density {nnz_A/n**2:.1%})')
print(f' Nonzeros in L: {nnz_L} (density {nnz_L/n**2:.1%})')
print(f' Nonzeros in U: {nnz_U} (density {nnz_U/n**2:.1%})')
print(f' Fill-in: L+U has {nnz_L+nnz_U} nonzeros vs {nnz_A} in A')
print('\nFor tridiagonal: LU stays bidiagonal (no fill-in!).')
print('For general sparse: reordering (AMD, nested dissection) minimizes fill-in.')
# Cholesky of SPD tridiagonal (positive definite version)
A_spd_tri = A_tridiag.copy() # already symmetric; check PD
eigmin = np.linalg.eigvalsh(A_spd_tri).min()
print(f'\nSmallest eigenvalue of tridiagonal: {eigmin:.4f} (positive -> SPD)')
L_chol = la.cholesky(A_spd_tri, lower=True)
nnz_Lchol = np.sum(np.abs(L_chol) > tol)
print(f'Cholesky L: {nnz_Lchol} nonzeros (same structure as lower band)')
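To see the ordering effect mentioned above, here is a small illustrative example (not in the original notebook): an arrowhead matrix that is dense only in its first row and column fills in completely if that row is eliminated first, and hardly at all if it is ordered last.
# Arrowhead matrix: dense first row/column, diagonal elsewhere (SPD by diagonal dominance)
n = 10
A_arrow = 4.0 * np.eye(n)
A_arrow[0, :] = 1.0
A_arrow[:, 0] = 1.0
A_arrow[0, 0] = float(n)
perm = np.r_[1:n, 0]   # move the dense row/column to the end
A_reordered = A_arrow[np.ix_(perm, perm)]
for name, M in [('original order', A_arrow), ('dense row last', A_reordered)]:
    _, L_f, U_f = la.lu(M)
    nnz_LU = np.sum(np.abs(L_f) > tol) + np.sum(np.abs(U_f) > tol)
    print(f'{name:>15}: nnz(L) + nnz(U) = {nnz_LU}  (A itself has {np.sum(np.abs(M) > tol)} nonzeros)')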
Code cell 50
# === A.3 Complete Worked Example: Solving a GP System Step by Step ===
import numpy as np, scipy.linalg as la
# Complete pipeline: build kernel matrix, factor, solve, compute log-det
np.random.seed(42)
# 1. Training data
n_gp = 30
X_gp = np.linspace(0, 4*np.pi, n_gp).reshape(-1, 1)
y_gp = np.sin(X_gp.ravel()) + 0.1 * np.random.randn(n_gp)
# 2. Kernel matrix (RBF, ell=1.0, sf=1.0)
ell, sf, sigma_n = 1.0, 1.0, 0.1
diff = X_gp[:, None, :] - X_gp[None, :, :]
K = sf**2 * np.exp(-np.sum(diff**2, axis=-1) / (2 * ell**2))
K_noisy = K + sigma_n**2 * np.eye(n_gp)
print(f'Kernel matrix size: {K_noisy.shape}')
print(f'Condition number: {np.linalg.cond(K_noisy):.2e}')
# 3. Cholesky factorization
L_gp = la.cholesky(K_noisy, lower=True)
print(f'Cholesky computed. L diagonal range: [{L_gp.diagonal().min():.4f}, {L_gp.diagonal().max():.4f}]')
# 4. Solve for alpha = K^{-1} y via forward + backward substitution
v_gp = la.solve_triangular(L_gp, y_gp, lower=True) # L v = y
alpha = la.solve_triangular(L_gp.T, v_gp, lower=False) # L^T alpha = v
# 5. Log marginal likelihood
log_det = 2 * np.sum(np.log(L_gp.diagonal())) # log det K = 2 sum log L_ii
data_fit = 0.5 * y_gp @ alpha
complexity = 0.5 * log_det
const = 0.5 * n_gp * np.log(2 * np.pi)
log_marg_lik = -(data_fit + complexity + const)
print(f'\nLog marginal likelihood decomposition:')
print(f' Data fit term: -{data_fit:.3f}')
print(f' Complexity term: -{complexity:.3f} (log det = {log_det:.3f})')
print(f' Constant: -{const:.3f}')
print(f' Total: {log_marg_lik:.3f}')
# 6. Verify: alpha should satisfy K_noisy @ alpha ≈ y_gp
residual = np.linalg.norm(K_noisy @ alpha - y_gp) / np.linalg.norm(y_gp)
print(f'\nResidual ||K alpha - y|| / ||y|| = {residual:.2e}')
print(f"{'PASS' if residual < 1e-10 else 'FAIL'} — GP system solved accurately via Cholesky")
Code cell 51
# === A.4 Benchmarking All Three Factorizations Together ===
import numpy as np, scipy.linalg as la, time
np.random.seed(42)
print('Final benchmark: LU vs QR vs Cholesky')
print(f'{"n":>6} | {"LU (ms)":>9} | {"QR (ms)":>9} | {"Chol (ms)":>10} | '
f'{"Chol/LU":>8} | {"Chol theory"}' )
print('-' * 70)
for n in [50, 100, 200, 400, 800]:
# Build SPD matrix
X = np.random.randn(n, n + 20)
A_spd = X @ X.T / (n+20) + np.eye(n)
A_gen = A_spd.copy() # also a valid general matrix
reps = max(3, int(1000 / n))
t0 = time.perf_counter()
for _ in range(reps):
la.lu_factor(A_gen)
t_lu = (time.perf_counter() - t0) / reps * 1e3
t0 = time.perf_counter()
for _ in range(reps):
la.qr(A_gen)
t_qr = (time.perf_counter() - t0) / reps * 1e3
t0 = time.perf_counter()
for _ in range(reps):
la.cholesky(A_spd)
t_chol = (time.perf_counter() - t0) / reps * 1e3
ratio = t_chol / t_lu
theoretical = '~0.5' if ratio < 0.7 else f'{ratio:.2f}'
print(f'{n:>6} | {t_lu:>9.2f} | {t_qr:>9.2f} | {t_chol:>10.2f} | '
f'{ratio:>8.2f} | {theoretical}')
print('\nCholesky should be ~0.5x the cost of LU (symmetric exploitation).')
print('QR is more expensive than LU (orthogonalization overhead).')
print('\nAll factorization cells complete.')
Summary
This notebook covered all major matrix decompositions as computational tools:
| Section | Algorithm | Key insight |
|---|---|---|
| 1 | Forward/backward substitution | Triangular = solved; $O(n^2)$ flops |
| 2 | LU with partial pivoting | $PA = LU$; pivoting controls element growth |
| 3 | Householder QR | Sign convention avoids cancellation; $\|Q^\top Q - I\| = O(u)$ |
| 4 | Givens rotations | Selective entry zeroing; sparse/streaming |
| 5 | Rank-revealing QR | $A\Pi = QR$; diagonal of $R$ exposes numerical rank |
| 6 | Cholesky (computational) | Half the cost of LU; $n^3/3$ flops |
| 7 | Backward error | Forward error $\le$ condition number $\times$ backward error |
| 8 | Blocked algorithms | BLAS-3 gives near-peak hardware throughput |
| 9 | GP inference | Cholesky for $(K + \sigma_n^2 I)^{-1} y$ and $\log\det$ |
| 10 | Differentiation | Implicit diff through $Ax = b$; adjoint via $A^\top$ solves |
| 11 | Randomized SVD | $O(mnr)$ near-optimal approximation (LoRA, GaLore) |
| 12 | ML applications | K-FAC, linear probing, natural gradient |
Next: §04 Calculus Fundamentals where Hessians (Chapter 4 §05) will draw on Cholesky (SPD at minima) and Newton's method will require LU/Cholesky for Hessian systems.