Math for LLMs — Numerical Methods / Numerical Linear Algebra

Converted from exercises.ipynb for web reading.

Numerical Linear Algebra — Exercises

"The best way to understand an algorithm is to implement it and watch it fail."

10 graded exercises covering: LU with pivoting, QR stability, conjugate gradient, condition numbers, preconditioning, iterative refinement, power iteration, sparse systems, and randomized low-rank approximation.

Companion: notes.md | theory.ipynb

Difficulty: ★ Easy | ★★ Medium | ★★★ Hard

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as nla
from scipy import stats
import scipy.linalg as sla

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({'figure.figsize': (10, 5), 'figure.dpi': 110})
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
print('Setup complete.')

print("Chapter helper setup complete.")

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    print(f"{'PASS' if cond else 'FAIL'} - {name}")
    return bool(cond)

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        print('  expected:', expected)
        print('  got     :', got)
    return bool(ok)

print("Chapter helper setup complete.")

Exercise 1 ★ — LU Factorization with Partial Pivoting

Implement Gaussian elimination with partial pivoting from scratch.

(a) Implement lu_gepp(A) returning (P, L, U) such that P @ A == L @ U.

(b) Implement solve_lu(A, b) using your factorization plus forward/backward substitution.

(c) Compare your solution to numpy.linalg.solve on a random $6 \times 6$ matrix.

(d) Demonstrate that without pivoting, a near-zero diagonal entry causes large errors: test with $A = \begin{pmatrix} 10^{-12} & 1 \\ 1 & 1 \end{pmatrix}$. (Without pivoting the first multiplier is $10^{12}$, so the update $a_{22} \leftarrow 1 - 10^{12}$ wipes out the original entry in floating point.)

Code cell 5

# Your Solution
# Exercise 1: LU with Partial Pivoting

def lu_gepp(A):
    """Gaussian elimination with partial pivoting."""

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 6

# Solution
def lu_gepp_sol(A):
    n = A.shape[0]
    A = A.astype(np.float64).copy()
    P = np.eye(n)
    L = np.zeros((n, n))
    for k in range(n - 1):
        pr = k + np.argmax(np.abs(A[k:, k]))
        if pr != k:
            A[[k, pr], :]  = A[[pr, k], :]
            P[[k, pr], :]  = P[[pr, k], :]
            L[[k, pr], :k] = L[[pr, k], :k]
        for i in range(k+1, n):
            L[i, k] = A[i, k] / A[k, k]
            A[i, k:] -= L[i, k] * A[k, k:]
    L += np.eye(n)
    return P, L, np.triu(A)

def fwd_sol(L, b):
    n = len(b)
    x = np.zeros(n)
    for i in range(n): x[i] = b[i] - L[i,:i] @ x[:i]
    return x

def bck_sol(U, b):
    n = len(b)
    x = np.zeros(n)
    for i in range(n-1,-1,-1): x[i] = (b[i] - U[i,i+1:] @ x[i+1:]) / U[i,i]
    return x

def solve_lu_sol(A, b):
    P, L, U = lu_gepp_sol(A)
    return bck_sol(U, fwd_sol(L, P @ b))

np.random.seed(42)
A = np.random.randn(6, 6)
b = np.random.randn(6)
x_lu = solve_lu_sol(A, b)
x_np = nla.solve(A, b)
print(f'(c) Max difference vs numpy: {np.abs(x_lu - x_np).max():.2e}')
print(f'    Residual: {nla.norm(A @ x_lu - b):.2e}')

# (d) Near-zero pivot
eps = 1e-12
A2 = np.array([[eps, 1.0], [1.0, 1.0]])
b2 = np.array([1.0, 2.0])
x_true = nla.solve(A2, b2)
x_gepp = solve_lu_sol(A2, b2)
print(f'\n(d) Near-zero pivot (eps=1e-12):')
print(f'    True:  {x_true}')
print(f'    GEPP:  {x_gepp}')
print(f'    Error: {nla.norm(x_gepp - x_true) / nla.norm(x_true):.2e}')

ok = np.allclose(x_lu, x_np) and nla.norm(x_gepp - x_true) / nla.norm(x_true) < 1e-10
print(f"{'PASS' if ok else 'FAIL'} - LU with pivoting verified")

Exercise 2 ★★ — Condition Number Experiment

Verify the rule of thumb: if $\kappa(A) = 10^k$, roughly $k$ significant digits are lost (e.g. with $\kappa = 10^8$ in fp64, which carries about 16 digits, expect only about 8 correct digits).

(a) For $n = 10$, build matrices with $\kappa(A) \in \{10^1, 10^3, 10^5, 10^7, 10^9, 10^{12}\}$. Use the SVD to construct $A = U \Sigma V^\top$ with known singular values.

(b) Solve $Ax = b$ where $x = \mathbf{1}$ (so $b = A \mathbf{1}$).

(c) Report: $\kappa(A)$, $\log_{10}(\kappa)$, relative forward error, and digits lost.

(d) Plot relative error vs $\log_{10}(\kappa)$ and compare to the theoretical prediction.

Code cell 8

# Your Solution
# Exercise 2: Condition Number Experiment

def build_matrix_with_kappa(kappa, n=10):
    """Build n x n matrix with given condition number via SVD."""

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 9

# Solution
def build_matrix_kappa_sol(kappa, n=10):
    # Symmetric variant of A = U Sigma V^T: using the same orthogonal Q on
    # both sides still yields exactly the requested condition number.
    Q, _ = nla.qr(np.random.randn(n, n))
    sigma = np.logspace(0, np.log10(kappa), n)
    return Q @ np.diag(sigma) @ Q.T

kappas = [1e1, 1e3, 1e5, 1e7, 1e9, 1e12]
kappa_actual = []
rel_errors = []

print(f'{"κ(A) target":>14}  {"κ(A) actual":>14}  {"rel error":>12}  {"digits lost":>12}')
for kappa in kappas:
    np.random.seed(42)
    A = build_matrix_kappa_sol(kappa)
    x_true = np.ones(10)
    b = A @ x_true
    x_comp = nla.solve(A, b)
    k_actual = nla.cond(A)
    rel_err = nla.norm(x_comp - x_true) / nla.norm(x_true)
    digits = max(0.0, 16 + np.log10(max(rel_err, 1e-16)))  # fp64 carries ~16 digits
    kappa_actual.append(k_actual)
    rel_errors.append(rel_err)
    print(f'{kappa:14.1e}  {k_actual:14.3e}  {rel_err:12.3e}  {digits:12.1f}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(8, 5))
    log_kappa = np.log10(np.array(kappa_actual))
    log_err = np.log10(np.maximum(rel_errors, 1e-16))
    ax.plot(log_kappa, log_err, 'o-', color='#0077BB', label='Actual error')
    ax.plot([0, 15], [-15, 0], 'r--', label='Theoretical: log(err) = log(κ) - 15')
    ax.set_xlabel('log₁₀(κ)')
    ax.set_ylabel('log₁₀(relative error)')
    ax.set_title('Digit loss vs condition number')
    ax.legend()
    plt.tight_layout()
    plt.show()

ok = rel_errors[2] < 1e-5 and rel_errors[4] > 1e-8  # Moderate κ ok, high κ shows error
print(f"{'PASS' if ok else 'FAIL'} - condition number effect on accuracy verified")

Exercise 3 ★★ — QR Stability: Normal Equations vs Householder

(a) Generate $A \in \mathbb{R}^{40 \times 8}$ with singular values $\sigma_i = \kappa^{(i-1)/(n-1)}$ for varying $\kappa$.

(b) Compute the least squares solution $x^* = \arg\min_x \|Ax - b\|$ two ways:
- Normal equations: form $A^\top A$ and solve
- numpy.linalg.lstsq (Householder QR)

(c) For each $\kappa$, compute the relative error in $x^*$.

(d) Plot $\log_{10}$(error) vs $\log_{10}(\kappa)$. Show that normal equations fail around $\kappa \approx 10^7$, while QR remains accurate up to $\kappa \approx 10^{15}$.
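
Why normal equations fail early: forming $A^\top A$ squares the condition number, $\kappa(A^\top A) = \kappa(A)^2$. A quick sanity check (a sketch; the construction mirrors part (a), with illustrative sizes and seed):

# Sketch: verify kappa(A^T A) = kappa(A)^2 numerically for a moderately
# ill-conditioned A (sizes and seed chosen here for illustration only).
np.random.seed(0)
m, n = 40, 8
U, _ = nla.qr(np.random.randn(m, n))
Vt, _ = nla.qr(np.random.randn(n, n))
A = U @ np.diag(np.logspace(0, 4, n)) @ Vt   # kappa(A) = 1e4
print(f'kappa(A)     = {nla.cond(A):.3e}')
print(f'kappa(A^T A) = {nla.cond(A.T @ A):.3e}')   # expect ~1e8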

Code cell 11

# Your Solution
# Exercise 3: Normal Equations vs QR

def solve_normal(A, b):
    """Solve least squares via Normal equations."""

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 12

# Solution
np.random.seed(42)
m, n = 40, 8
kappas = np.logspace(1, 14, 20)
err_normal = []
err_qr_sol = []

for kappa in kappas:
    # Build A with known singular values
    U, _ = nla.qr(np.random.randn(m, n))
    Vt, _ = nla.qr(np.random.randn(n, n))
    sigma = np.logspace(0, np.log10(kappa), n)
    A = U @ np.diag(sigma) @ Vt

    x_true = np.ones(n)
    b = A @ x_true + 0.001 * np.random.randn(m)

    # Normal equations. Reference is the QR/lstsq solution: with noisy b the
    # exact least-squares minimizer no longer equals x_true.
    try:
        x_n = nla.solve(A.T @ A, A.T @ b)
        e_n = nla.norm(x_n - nla.lstsq(A, b, rcond=None)[0]) / nla.norm(x_true)
    except nla.LinAlgError:
        e_n = 1.0
    err_normal.append(max(e_n, 1e-16))

    # QR (lstsq)
    x_q, _, _, _ = nla.lstsq(A, b, rcond=None)
    e_q = nla.norm(x_q - x_true) / nla.norm(x_true)
    err_qr_sol.append(max(e_q, 1e-16))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.loglog(kappas, err_normal, color='#CC3311', label='Normal equations')
    ax.loglog(kappas, err_qr_sol, color='#0077BB', label='Householder QR')
    ax.axvline(1e7, color='gray', linestyle=':', label='κ = 10^7 (fp64 danger)')
    ax.set_xlabel('Condition number κ(A)')
    ax.set_ylabel('Relative error in solution')
    ax.set_title('Least squares accuracy: Normal equations vs QR')
    ax.legend()
    plt.tight_layout()
    plt.show()

# Verify QR beats normal equations for ill-conditioned case
idx_ill = int(len(kappas) * 0.7)
ok = err_qr_sol[idx_ill] < err_normal[idx_ill]
print(f"{'PASS' if ok else 'FAIL'} - QR more accurate than Normal equations for ill-conditioned case")

Exercise 4 ★★ — Conjugate Gradient

(a) Implement the Conjugate Gradient algorithm for SPD systems $Ax = b$.

(b) Test on a $50 \times 50$ SPD matrix. Track and plot the relative residual $\|r_k\|/\|b\|$.

(c) Verify the convergence bound: $\|e_k\|_A / \|e_0\|_A \leq 2\left(\frac{\sqrt{\kappa}-1}{\sqrt{\kappa}+1}\right)^k$.

(d) Add diagonal (Jacobi) preconditioning with $D = \operatorname{diag}(A)$ and compare convergence. Apply the scaling symmetrically, solving $D^{-1/2} A D^{-1/2}\, y = D^{-1/2} b$ with $x = D^{-1/2} y$, so the preconditioned system stays SPD (one-sided scaling by $D^{-1}$ destroys the symmetry that CG requires).

Code cell 14

# Your Solution
# Exercise 4: Conjugate Gradient

def cg(A, b, tol=1e-10, max_iter=None):
    """
    Conjugate gradient for SPD Ax = b.
    Returns (x, residuals) where residuals[k] = ||r_k|| / ||b||.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 15

# Solution
def cg_sol(A, b, tol=1e-10, max_iter=None):
    n = len(b)
    if max_iter is None: max_iter = 3 * n
    x = np.zeros(n)
    r = b - A @ x
    p = r.copy()
    rs_old = r @ r
    bnorm = nla.norm(b)
    residuals = [np.sqrt(rs_old) / bnorm]
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rs_old / (p @ Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_new = r @ r
        residuals.append(np.sqrt(rs_new) / bnorm)
        if residuals[-1] < tol: break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x, residuals

np.random.seed(42)
n = 50
kappa = 100
lam = np.linspace(1, kappa, n)
Q, _ = nla.qr(np.random.randn(n, n))
A_spd = Q @ np.diag(lam) @ Q.T
b_vec = np.random.randn(n)
x_true = nla.solve(A_spd, b_vec)

x_cg, res = cg_sol(A_spd, b_vec)
print(f'CG: {len(res)-1} iterations, final residual: {res[-1]:.2e}')
print(f'Forward error: {nla.norm(x_cg - x_true)/nla.norm(x_true):.2e}')

# (c) Compare to the bound. Note the bound controls ||e_k||_A / ||e_0||_A,
#     not the residual, so the residual curve is compared to it only as a guide.
k_vals = np.arange(len(res))
bound = [2 * ((np.sqrt(kappa)-1)/(np.sqrt(kappa)+1))**k for k in k_vals]

# (d) Diagonal (Jacobi) preconditioner, applied symmetrically so the
#     preconditioned matrix stays SPD
d_half_inv = 1.0 / np.sqrt(np.diag(A_spd))
A_pc = d_half_inv[:, None] * A_spd * d_half_inv[None, :]
b_pc = d_half_inv * b_vec
y_pc, res_pc = cg_sol(A_pc, b_pc)
x_pc = d_half_inv * y_pc  # map back to the original variables

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(res, color='#0077BB', label='CG (no precond)')
    ax.semilogy(res_pc, color='#009988', label='CG (diagonal precond)')
    ax.semilogy(k_vals[:len(res)], bound[:len(res)], 'r--', label='Theoretical bound')
    ax.set_xlabel('Iteration'); ax.set_ylabel('Relative residual')
    ax.set_title(f'CG convergence (κ={kappa}): with and without preconditioning')
    ax.legend()
    plt.tight_layout(); plt.show()

ok = len(res_pc) <= len(res)  # Precond should help or equal
print(f"{'PASS' if ok else 'FAIL'} - preconditioning converges in fewer iterations")

Exercise 5 ★★ — Power Iteration and Spectral Gap

(a) Implement power iteration to find the dominant eigenvalue.

(b) Build a $20 \times 20$ symmetric matrix with eigenvalues $\lambda_1 = 10$, $\lambda_2 = 5$, and $\lambda_3, \ldots, \lambda_{20}$ uniform in $[1, 4]$. Run power iteration.

(c) Track the Rayleigh quotient at each step. Plot convergence.

(d) Compute the predicted convergence rate $|\lambda_2/\lambda_1|$ and verify empirically.

Code cell 17

# Your Solution
# Exercise 5: Power Iteration

def power_iter(A, tol=1e-10, max_iter=1000):
    """Power iteration. Returns (lambda, eigvec, rayleigh_history)."""

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 18

# Solution
def power_iter_sol(A, tol=1e-10, max_iter=1000):
    n = A.shape[0]
    x = np.random.randn(n)
    x /= nla.norm(x)
    history = []
    lam_old = 0.0
    for k in range(max_iter):
        y = A @ x
        lam = x @ y  # Rayleigh quotient
        history.append(lam)
        x = y / nla.norm(y)
        if abs(lam - lam_old) < tol: break
        lam_old = lam
    return lam, x, history

np.random.seed(42)
n = 20
lam_true = np.array([10., 5.] + list(np.linspace(1, 4, 18)))
Q, _ = nla.qr(np.random.randn(n, n))
A_test = Q @ np.diag(lam_true) @ Q.T

lam1, v1, history = power_iter_sol(A_test)
print(f'lambda_1 = {lam1:.8f} (true: {lam_true[0]:.1f})')
print(f'Converged in {len(history)} iterations')

# (d) Convergence rate
rate = abs(lam_true[1] / lam_true[0])
print(f'Predicted rate: |lambda_2/lambda_1| = {rate:.3f}')

errors = [abs(h - lam_true[0]) for h in history]

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.semilogy(errors[1:], color='#0077BB', label='|Rayleigh quotient - lambda_1|')
    k_vals = np.arange(1, len(history))
    # For symmetric A the Rayleigh quotient converges at the *squared* rate
    # (lambda_2/lambda_1)^{2k}; the rate^k line bounds the eigenvector error.
    ax.semilogy(k_vals, errors[1] * rate**(k_vals - 1), 'r--', label=f'Rate = {rate:.2f}^k')
    ax.semilogy(k_vals, errors[1] * rate**(2 * (k_vals - 1)), 'g:', label=f'Rate = {rate:.2f}^(2k)')
    ax.set_xlabel('Iteration'); ax.set_ylabel('Error (log scale)')
    ax.set_title('Power iteration convergence')
    ax.legend()
    plt.tight_layout(); plt.show()

ok = abs(lam1 - lam_true[0]) < 1e-8
print(f"{'PASS' if ok else 'FAIL'} - power iteration converged to lambda_1")

Exercise 6 ★★ — Iterative Refinement

(a) Factor $A$ in fp32 using scipy.linalg.lu_factor (cast to float32).

(b) Solve $Ax = b$ using the fp32 factorization and measure the error vs the true fp64 solution.

(c) Apply iterative refinement:
- Compute the residual $r = b - A\hat{x}$ in fp64
- Solve $A\delta = r$ using the fp32 factorization
- Update $\hat{x} \leftarrow \hat{x} + \delta$
- Repeat until convergence

(d) Plot error vs refinement step. Verify convergence to fp64 accuracy.
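
A useful rule of thumb for why this works: each refinement step contracts the error by roughly a factor $\kappa(A)\, u_{32}$, where $u_{32} \approx 6 \times 10^{-8}$ is the fp32 unit roundoff, until the fp64 residual floor is reached. With $\kappa \approx 10^4$ here, that is a contraction of about $\kappa\, u_{32} \approx 6 \times 10^{-4}$ per step, so three to four steps should reach fp64 accuracy.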

Code cell 20

# Your Solution
# Exercise 6: Iterative Refinement

# (a) Factor in fp32
np.random.seed(42)
n = 30
kappa = 1e4  # Moderate condition number
lam_k = np.logspace(0, 4, n)
Q, _ = nla.qr(np.random.randn(n, n))
A_fp64 = Q @ np.diag(lam_k) @ Q.T
A_fp32 = A_fp64.astype(np.float32)
b_fp64 = np.random.randn(n)
x_true = nla.solve(A_fp64, b_fp64)

# YOUR CODE: factor in fp32, solve, then refine

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 21

# Solution
np.random.seed(42)
n = 30
lam_k = np.logspace(0, 4, n)
Q, _ = nla.qr(np.random.randn(n, n))
A_fp64 = Q @ np.diag(lam_k) @ Q.T
A_fp32 = A_fp64.astype(np.float32)
b_fp64 = np.random.randn(n)
x_true = nla.solve(A_fp64, b_fp64)

# (a) Factor in fp32
lu32, piv32 = sla.lu_factor(A_fp32)

# (b) Solve in fp32
x_hat = sla.lu_solve((lu32, piv32), b_fp64.astype(np.float32)).astype(np.float64)
err0 = nla.norm(x_hat - x_true) / nla.norm(x_true)
print(f'fp32 solve error: {err0:.3e}')

# (c) Iterative refinement
errors = [err0]
x_ref = x_hat.copy()
for step in range(10):
    r = b_fp64 - A_fp64 @ x_ref  # Residual in fp64
    delta = sla.lu_solve((lu32, piv32), r.astype(np.float32)).astype(np.float64)
    x_ref += delta
    err = nla.norm(x_ref - x_true) / nla.norm(x_true)
    errors.append(err)
    if err < 1e-14: break

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.semilogy(errors, 'o-', color='#0077BB')
    ax.axhline(float(np.finfo(np.float64).eps), color='red', linestyle='--', label='fp64 eps')
    ax.axhline(float(np.finfo(np.float32).eps), color='gray', linestyle=':', label='fp32 eps')
    ax.set_xlabel('Refinement step'); ax.set_ylabel('Relative error')
    ax.set_title('Iterative refinement: fp32 factor + fp64 residual')
    ax.legend()
    plt.tight_layout(); plt.show()

print(f'After {len(errors)-1} refinement steps: error = {errors[-1]:.2e}')
ok = errors[-1] < errors[0] * 0.01  # Significant improvement
print(f"{'PASS' if ok else 'FAIL'} - iterative refinement improves accuracy")

Exercise 7 ★★ — Sparse System: 1D Poisson Equation

Solve the 1D Poisson equation $-u'' = f$ on $[0, 1]$ with $u(0) = u(1) = 0$.

The finite difference discretization gives the tridiagonal system:

$$T_n u = h^2 f, \quad T_n = \begin{pmatrix} 2 & -1 & & \\ -1 & 2 & -1 & \\ & \ddots & \ddots & \ddots \\ & & -1 & 2 \end{pmatrix}$$

where $h = 1/(n+1)$.

(a) Build $T_n$ as a sparse matrix for $n = 500$.

(b) Solve using dense numpy.linalg.solve and time it.

(c) Solve using scipy.sparse.linalg.spsolve and time it.

(d) Solve using scipy.sparse.linalg.cg and report iterations needed.

(e) Compare the solution against the true answer: if $f = \pi^2 \sin(\pi x)$, then $u = \sin(\pi x)$.
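
For reference, the eigenvalues of $T_n$ are known in closed form, $\lambda_k = 2 - 2\cos\!\big(\tfrac{k\pi}{n+1}\big)$, so $\kappa(T_n) \approx 4(n+1)^2/\pi^2$ grows like $n^2$. A short sketch of what that predicts for CG (iteration count scaling roughly like $\sqrt{\kappa} \sim n$):

# Sketch: condition number of T_n from its closed-form eigenvalues, and the
# rough CG iteration scale sqrt(kappa) it implies.
n = 500
k = np.arange(1, n + 1)
lam = 2 - 2 * np.cos(k * np.pi / (n + 1))
kappa_Tn = lam.max() / lam.min()
print(f'kappa(T_n) = {kappa_Tn:.3e}  (~4(n+1)^2/pi^2 = {4*(n+1)**2/np.pi**2:.3e})')
print(f'sqrt(kappa) = {np.sqrt(kappa_Tn):.0f}  ~ expected CG iteration scale')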

Code cell 23

# Your Solution
# Exercise 7: Sparse Poisson System

import time
try:
    import scipy.sparse as sp
    import scipy.sparse.linalg as spla
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    print('scipy not available')

if HAS_SCIPY:
    n = 500
    h = 1.0 / (n + 1)
    x_grid = np.linspace(h, 1-h, n)

    # YOUR CODE: build T_n as sparse matrix
    # f = pi^2 * sin(pi * x), true u = sin(pi * x)
    f = np.pi**2 * np.sin(np.pi * x_grid)
    u_true = np.sin(np.pi * x_grid)

    # YOUR CODE: solve three ways and compare

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 24

# Solution
import time
try:
    import scipy.sparse as sp
    import scipy.sparse.linalg as spla
    HAS_SCIPY_SP = True
except ImportError:
    HAS_SCIPY_SP = False

if HAS_SCIPY_SP:
    n = 500
    h = 1.0 / (n + 1)
    x_grid = np.linspace(h, 1-h, n)
    f_rhs = np.pi**2 * np.sin(np.pi * x_grid) * h**2
    u_true = np.sin(np.pi * x_grid)

    # Build sparse T_n
    diag_main = 2 * np.ones(n)
    diag_off  = -np.ones(n - 1)
    T_sparse = sp.diags([diag_off, diag_main, diag_off], [-1, 0, 1],
                        shape=(n, n), format='csr')

    # Dense solve
    T_dense = T_sparse.toarray()
    t0 = time.perf_counter()
    u_dense = nla.solve(T_dense, f_rhs)
    t_dense = time.perf_counter() - t0

    # Sparse direct
    t0 = time.perf_counter()
    u_sparse = spla.spsolve(T_sparse, f_rhs)
    t_sparse = time.perf_counter() - t0

    # CG
    n_iter = [0]
    def callback(x): n_iter[0] += 1
    t0 = time.perf_counter()
    u_cg, info = spla.cg(T_sparse, f_rhs, rtol=1e-10, callback=callback)
    t_cg = time.perf_counter() - t0

    print(f'Results for n={n} Poisson system:')
    print(f'  Dense:   error={nla.norm(u_dense - u_true):.3e}, time={t_dense:.3f}s')
    print(f'  Sparse:  error={nla.norm(u_sparse - u_true):.3e}, time={t_sparse:.4f}s')
    print(f'  CG:      error={nla.norm(u_cg - u_true):.3e}, time={t_cg:.4f}s, iters={n_iter[0]}')
    print(f'  Speedup (dense/sparse): {t_dense/t_sparse:.1f}x')

    ok = all(nla.norm(u - u_true) < 1e-3
             for u in [u_dense, u_sparse, u_cg])
    print(f"{'PASS' if ok else 'FAIL'} - all methods converge to true solution")

Exercise 8 ★★★ — Adam as Diagonal Preconditioning

Demonstrate that Adam's update rule implements diagonal preconditioning.

(a) Implement gradient descent on a quadratic $f(x) = \frac{1}{2} x^\top H x$ where $H = \operatorname{diag}(1, 100, 1000, 10000)$ (a diagonal Hessian with a large spread of curvatures).

(b) Implement Adam's update rule: $m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t$, $\quad v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2$, $\quad x_t = x_{t-1} - \alpha\, \hat{m}_t / (\sqrt{\hat{v}_t} + \varepsilon)$

(c) Implement diagonal-preconditioned gradient descent: $x \leftarrow x - \alpha H^{-1} g$ (using the true diagonal Hessian).

(d) Compare convergence of GD, Adam, and diagonal-preconditioned GD. Show that Adam mimics the optimal diagonal preconditioner asymptotically.
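
Why this mimics preconditioning: on a deterministic quadratic, $\hat{m}_t \approx g_t$ and $\hat{v}_t \approx g_t^2$, so Adam's step is approximately $\alpha\, g_t / |g_t| = \alpha \operatorname{sign}(g_t)$ per coordinate. Each coordinate then advances at the same speed regardless of its curvature, which is what the ideal diagonal scaling $H^{-1} g$ achieves for diagonal $H$, up to the step-size behavior near the optimum.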

Code cell 26

# Your Solution
# Exercise 8: Adam as Diagonal Preconditioning

# YOUR CODE: implement and compare GD, Adam, diagonal-preconditioned GD

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 27

# Solution
# Quadratic: f(x) = 0.5 * x^T H x, grad = H x
h_diag = np.array([1.0, 100.0, 1000.0, 10000.0])
d = len(h_diag)
x0 = np.ones(d)
n_steps = 300

# === Gradient Descent ===
eta_gd = 2.0 / (h_diag.max() + h_diag.min())  # Optimal step
x_gd = x0.copy()
err_gd = [nla.norm(x_gd)]
for _ in range(n_steps):
    g = h_diag * x_gd  # Gradient of f
    x_gd -= eta_gd * g
    err_gd.append(nla.norm(x_gd))

# === Adam ===
beta1, beta2, eps = 0.9, 0.999, 1e-8
alpha_adam = 0.1
x_adam = x0.copy()
m = np.zeros(d)
v = np.zeros(d)
err_adam = [nla.norm(x_adam)]
for t in range(1, n_steps + 1):
    g = h_diag * x_adam
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g**2
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    x_adam -= alpha_adam * m_hat / (np.sqrt(v_hat) + eps)
    err_adam.append(nla.norm(x_adam))

# === Diagonal-preconditioned GD ===
eta_pc = 1.0  # Preconditioned: converges in 1 step ideally
x_pc = x0.copy()
err_pc = [nla.norm(x_pc)]
for _ in range(n_steps):
    g = h_diag * x_pc
    x_pc -= eta_pc * g / h_diag  # H^{-1} g = g / diag(H)
    err_pc.append(nla.norm(x_pc))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(err_gd, color='#CC3311', label=f'GD (κ={h_diag.max():.0f})')
    ax.semilogy(err_adam, color='#EE7733', label='Adam')
    ax.semilogy(err_pc, color='#0077BB', label='Diagonal-precond GD')
    ax.set_xlabel('Iteration'); ax.set_ylabel('||x|| (log scale)')
    ax.set_title('Adam mimics diagonal preconditioning')
    ax.legend()
    plt.tight_layout(); plt.show()

ok = err_adam[-1] < err_gd[-1]  # Adam should beat GD
print(f'GD final error:   {err_gd[-1]:.4f}')
print(f'Adam final error: {err_adam[-1]:.4f}')
print(f'PC GD final error: {err_pc[-1]:.4e}')
print(f"{'PASS' if ok else 'FAIL'} - Adam outperforms vanilla GD on ill-conditioned problem")

Summary

You have practiced:

  1. LU with pivoting — implementation, stability demonstration, comparison without pivoting
  2. Condition numbers — digit loss, building ill-conditioned matrices, empirical verification
  3. Normal equations vs QR — condition number squaring, accuracy comparison across κ
  4. Conjugate gradient — implementation, convergence bound, diagonal preconditioning
  5. Power iteration — convergence rate via spectral gap, Rayleigh quotient tracking
  6. Iterative refinement — fp32 factor + fp64 residual, achieving fp64 accuracy
  7. Sparse systems — 1D Poisson equation, sparse direct vs iterative solvers
  8. Adam as preconditioning — connecting optimizer theory to numerical linear algebra

Two bonus exercises follow: mixed-precision iterative refinement (Exercise 9) and randomized low-rank approximation (Exercise 10).

Next: Numerical Optimization →

Exercise 9 ★★★ — Iterative Refinement (Mixed Precision)

Solve an ill-conditioned linear system in low precision, then use high-precision residuals to improve the solution.

Code cell 30

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 31

# Solution
np.random.seed(42)

# Ill-conditioned SPD test matrix: singular values logspaced from 1 down to 1e-5
U, _ = np.linalg.qr(np.random.randn(6, 6))
s = np.logspace(0, -5, 6)
A = U @ np.diag(s) @ U.T
x_true = np.ones(6)
b = A @ x_true

# Low-precision (fp32) solve
x = np.linalg.solve(A.astype(np.float32), b.astype(np.float32)).astype(float)
err0 = np.linalg.norm(x - x_true)

# Refinement loop: fp64 residual, fp32 correction solve
for _ in range(5):
    r = b - A @ x
    dx = np.linalg.solve(A.astype(np.float32), r.astype(np.float32)).astype(float)
    x = x + dx
err1 = np.linalg.norm(x - x_true)

header('Exercise 9: Iterative Refinement')
print(f'initial error={err0:.3e}, refined error={err1:.3e}')
check_true('refinement reduces error', err1 < err0)
print('Takeaway: high-precision residuals can recover accuracy from cheaper low-precision solves.')

Exercise 10 ★★★ — Randomized Low-Rank Approximation

Use a randomized range finder to approximate a matrix with rapidly decaying singular values.

Code cell 33

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 34

# Solution
np.random.seed(42)

# Test matrix with exponentially decaying singular values
U, _ = np.linalg.qr(np.random.randn(80, 80))
V, _ = np.linalg.qr(np.random.randn(50, 50))
s = np.exp(-np.arange(50) / 6)
A = U[:, :50] @ np.diag(s) @ V.T

# Randomized range finder: sample the range of A, then project
k, p = 8, 5                                  # target rank k, oversampling p
Omega = np.random.randn(A.shape[1], k + p)   # Gaussian test matrix
Q, _ = np.linalg.qr(A @ Omega)               # orthonormal basis for range(A @ Omega)
B = Q.T @ A                                  # small (k+p) x n projected matrix
Ub, sb, Vtb = np.linalg.svd(B, full_matrices=False)
Ahat = Q @ Ub[:, :k] @ np.diag(sb[:k]) @ Vtb[:k]

err = np.linalg.norm(A - Ahat, 'fro')
best = np.sqrt((s[k:]**2).sum())             # optimal rank-k error (Eckart-Young)
header('Exercise 10: Randomized Low-Rank Approximation')
print(f'error={err:.4e}, best rank-{k} tail={best:.4e}')
check_true('randomized approximation is near the optimal tail scale', err < 2.5 * best)
print('Takeaway: randomized SVD is a core scalable primitive for large embedding and feature matrices.')