Floating-Point Arithmetic — Theory Notebook

"The floating-point number system is a finite subset of the reals, and this finiteness has consequences."

Interactive derivations covering: IEEE 754 encoding, machine epsilon, catastrophic cancellation, Kahan summation, condition numbers, stable algorithms (log-sum-exp, softmax), and mixed-precision training.

Companion: notes.md | exercises.ipynb

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import struct
import sys

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({
        'figure.figsize': (10, 6), 'figure.dpi': 120,
        'font.size': 13, 'axes.titlesize': 15, 'axes.labelsize': 13,
        'xtick.labelsize': 11, 'ytick.labelsize': 11,
        'legend.fontsize': 11, 'lines.linewidth': 2.0,
        'axes.spines.top': False, 'axes.spines.right': False,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

COLORS = {
    'primary':   '#0077BB',
    'secondary': '#EE7733',
    'tertiary':  '#009988',
    'error':     '#CC3311',
    'neutral':   '#555555',
    'highlight': '#EE3377',
}

np.set_printoptions(precision=15, suppress=False)
np.random.seed(42)
print('Setup complete.')
print(f'NumPy version: {np.__version__}')
print(f'Python float: {sys.float_info}')

print("Chapter helper setup complete.")

1. IEEE 754 Bit-Level Encoding

We decode the actual bits of floating-point numbers to understand the standard directly.

Code cell 5

# === 1.1 IEEE 754 Bit Decomposition ===

def fp32_bits(x):
    """Return the 32-bit IEEE 754 representation of a float32."""
    packed = struct.pack('>f', np.float32(x))
    bits = int.from_bytes(packed, 'big')
    sign = (bits >> 31) & 1
    exp  = (bits >> 23) & 0xFF
    mant = bits & 0x7FFFFF
    return sign, exp, mant, f'{bits:032b}'

def decode_fp32(x):
    """Decode and print fp32 number."""
    sign, exp, mant, bstr = fp32_bits(x)
    print(f'Value: {np.float32(x)}')
    print(f'Bits:  {bstr[0]} {bstr[1:9]} {bstr[9:]}')
    print(f'       sign  exponent  mantissa')
    if exp == 0 and mant == 0:
        print(f'       ({["+","-"][sign]}0  — signed zero)')
    elif exp == 255 and mant == 0:
        print(f'       ({["+","-"][sign]}Infinity)')
    elif exp == 255:
        print(f'       (NaN)')
    elif exp == 0:
        val = (-1)**sign * mant * 2**(-149)
        print(f'       Subnormal: (-1)^{sign} * 0.{mant:023b} * 2^(-126) = {val:.6e}')
    else:
        val = (-1)**sign * (1 + mant/2**23) * 2**(exp - 127)
        print(f'       Normal: (-1)^{sign} * 1.{mant:023b} * 2^({exp}-127) = {val:.6e}')
    print()

print('=== Decoding key fp32 values ===')
for x in [1.0, -1.0, 0.5, 3.14, 0.0, float('inf'), float('nan')]:
    decode_fp32(x)

Code cell 6

# === 1.2 Computing Machine Epsilon from First Principles ===

# Method 1: Iterate by halving
eps_fp64 = np.float64(1.0)
while np.float64(1.0) + eps_fp64 / 2 > np.float64(1.0):
    eps_fp64 /= 2

eps_fp32 = np.float32(1.0)
while np.float32(1.0) + np.float32(eps_fp32) / 2 > np.float32(1.0):
    eps_fp32 = np.float32(eps_fp32) / np.float32(2.0)

print('Machine epsilon by iteration:')
print(f'  fp64: {eps_fp64:.20e}  (theory: 2^-52 = {2**-52:.20e})')
print(f'  fp32: {eps_fp32:.10e}  (theory: 2^-23 = {2**-23:.10e})')
print()

# Method 2: NumPy info
print('NumPy machine epsilon:')
print(f'  fp64: {np.finfo(np.float64).eps:.20e}')
print(f'  fp32: {np.finfo(np.float32).eps:.10e}')
print(f'  fp16: {np.finfo(np.float16).eps:.6e}')
print()

# Verify: 1.0 + eps is representable, 1.0 + eps/2 is not
ok = (np.float32(1.0) + eps_fp32 > np.float32(1.0)) and \
     (np.float32(1.0) + eps_fp32/2 == np.float32(1.0))
print(f"{'PASS' if ok else 'FAIL'} - machine epsilon verified for fp32")

Code cell 7

# === 1.3 Density of Representable Numbers ===

# Near 1.0, spacing between consecutive fp32 numbers = eps_mach
# Near 2.0, spacing = 2 * eps_mach, etc.

def fp32_ulp(x):
    """Unit in the last place for fp32."""
    x = np.float32(x)
    _, exp, _, _ = fp32_bits(x)
    if exp == 0:  # subnormal
        return np.float32(2**-149)
    return np.float32(2**(exp - 127 - 23))

print('ULP (spacing to next fp32) at key values:')
for x in [0.5, 1.0, 2.0, 4.0, 1e6, 1e10, 65504.0]:
    ulp = fp32_ulp(x)
    print(f'  x = {x:12.1f}:  ULP = {ulp:.3e}  '
          f'(relative: {ulp/x:.3e})')

print()
print(f'fp32 max value: {np.finfo(np.float32).max:.6e}')
print(f'fp16 max value: {np.finfo(np.float16).max:.1f}')
print(f'fp32 min normal: {np.finfo(np.float32).tiny:.6e}')
print(f'fp32 min subnormal: {np.nextafter(np.float32(0), np.float32(1)):.6e}')
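
NumPy exposes the same spacing as np.spacing; a quick sanity check that it agrees with the hand-rolled fp32_ulp above:

# Sanity check: np.spacing should agree with fp32_ulp for normal numbers
for x in [0.5, 1.0, 2.0, 1e6]:
    print(f'  np.spacing({x}) = {float(np.spacing(np.float32(x))):.3e}   '
          f'fp32_ulp({x}) = {float(fp32_ulp(x)):.3e}')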

2. Rounding Error and Catastrophic Cancellation

We demonstrate that subtraction of nearly-equal numbers can destroy all significant digits.

Code cell 9

# === 2.1 Catastrophic Cancellation ===

# Example: compute 1 - cos(x) for small x
# True value ≈ x^2/2 for small x

x_vals = np.logspace(-1, -8, 50)  # From 0.1 to 1e-8
true_val = x_vals**2 / 2  # Leading-order Taylor approximation

# Method 1: Direct (catastrophic cancellation for small x)
direct_fp32 = np.float32(1.0) - np.cos(x_vals.astype(np.float32))
direct_fp64 = 1.0 - np.cos(x_vals)

# Method 2: Stable (use 2*sin^2(x/2) = 1 - cos(x))
stable_fp32 = 2 * np.sin(x_vals.astype(np.float32)/2)**2

# Relative errors, all measured against the stable fp64 computation
ref = 2 * np.sin(x_vals/2)**2  # Stable fp64 as ground truth
err_direct32 = np.abs(direct_fp32 - ref) / (ref + 1e-300)
err_stable32 = np.abs(stable_fp32 - ref) / (ref + 1e-300)
err_direct64 = np.abs(direct_fp64 - ref) / (ref + 1e-300)

print('Relative error in computing 1 - cos(x):')
print(f'{"x":>12}  {"Direct fp32":>14}  {"Stable fp32":>14}  {"Direct fp64":>14}')
for i in [0, 10, 20, 30, 40, 45]:
    print(f'{x_vals[i]:12.2e}  {err_direct32[i]:14.3e}  '
          f'{err_stable32[i]:14.3e}  {err_direct64[i]:14.3e}')

Code cell 10

# === 2.2 Visualization: Catastrophic Cancellation ===

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: Relative error vs x
    ax = axes[0]
    ax.loglog(x_vals, err_direct32 + 1e-16,
              color=COLORS['error'], label='Direct fp32 (1 - cos x)', linewidth=2)
    ax.loglog(x_vals, err_stable32 + 1e-16,
              color=COLORS['primary'], label='Stable fp32 (2 sin²(x/2))', linewidth=2)
    ax.loglog(x_vals, err_direct64 + 1e-16,
              color=COLORS['secondary'], label='Direct fp64', linewidth=2, linestyle='--')
    ax.axhline(np.finfo(np.float32).eps, color=COLORS['neutral'],
               linestyle=':', label='fp32 ε_mach')
    ax.set_xlabel('x')
    ax.set_ylabel('Relative error')
    ax.set_title('Catastrophic cancellation: 1 - cos(x)')
    ax.legend(fontsize=10)

    # Right: Significant digits lost
    ax = axes[1]
    digits_direct = -np.log10(err_direct32 + 1e-16)
    digits_stable = -np.log10(err_stable32 + 1e-16)
    ax.semilogx(x_vals, np.clip(digits_direct, 0, 8),
                color=COLORS['error'], label='Direct fp32', linewidth=2)
    ax.semilogx(x_vals, np.clip(digits_stable, 0, 8),
                color=COLORS['primary'], label='Stable fp32', linewidth=2)
    ax.axhline(7, color=COLORS['neutral'], linestyle=':', label='fp32 max digits (~7)')
    ax.set_xlabel('x')
    ax.set_ylabel('Significant digits correct')
    ax.set_title('Significant digits: stable vs. direct computation')
    ax.legend()

    fig.tight_layout()
    plt.show()
    print('Plot displayed.')

Code cell 11

# === 2.3 Quadratic Formula Cancellation ===

# For x^2 - 1e8*x + 1 = 0, the roots are ~1e8 and ~1e-8.
# The standard formula computes the small root as the difference of two
# nearly-equal ~1e8 quantities: catastrophic cancellation.

from decimal import Decimal, getcontext

a, b, c = np.float64(1.0), np.float64(-1e8), np.float64(1.0)
disc = np.sqrt(b**2 - 4*a*c)

# Standard formula
r1_standard = (-b + disc) / (2*a)
r2_standard = (-b - disc) / (2*a)  # Small root — catastrophic cancellation!

# Stable formula for small root: c / (a * r1)
r1_stable = (-b + disc) / (2*a)        # Large root — no cancellation, accurate
r2_stable = c / (a * r1_stable)        # Small root via Vieta's formula

# Ground truth via 50-digit decimal arithmetic
getcontext().prec = 50
disc_d = (Decimal(10)**16 - 4).sqrt()
r1_true = float((Decimal(10)**8 + disc_d) / 2)
r2_true = float((Decimal(10)**8 - disc_d) / 2)

print('Roots of x^2 - 1e8*x + 1 = 0:')
print(f'True r1 = {r1_true:.15e}')
print(f'True r2 = {r2_true:.15e}')
print()
print(f'Standard r1 = {r1_standard:.15e}  error: {abs(r1_standard - r1_true):.2e}')
print(f'Standard r2 = {r2_standard:.15e}  error: {abs(r2_standard - r2_true):.2e}')
print()
print(f'Stable r1   = {r1_stable:.15e}  error: {abs(r1_stable - r1_true):.2e}')
print(f'Stable r2   = {r2_stable:.15e}  error: {abs(r2_stable - r2_true):.2e}')

rel_err_standard = abs(r2_standard - r2_true) / r2_true
rel_err_stable   = abs(r2_stable   - r2_true) / r2_true
print()
print(f'Relative error (standard): {rel_err_standard:.2e}')
print(f'Relative error (stable):   {rel_err_stable:.2e}')
ok = rel_err_stable < 1e-12 and rel_err_standard > 1e-6
print(f"{'PASS' if ok else 'FAIL'} - stable formula avoids cancellation")

3. Kahan Compensated Summation

Demonstrate that naive summation error grows with $n$, while Kahan achieves $O(\varepsilon_{\text{mach}})$ error.

Code cell 13

# === 3.1 Naive vs Kahan Summation Error ===

def naive_sum_fp32(arr):
    s = np.float32(0.0)
    for x in arr:
        s += np.float32(x)
    return s

def kahan_sum_fp32(arr):
    s = np.float32(0.0)
    c = np.float32(0.0)  # Compensation
    for x in arr:
        y = np.float32(x) - c
        t = s + y
        c = (t - s) - y  # Captured rounding error
        s = t
    return s

# Sum of n copies of a small value epsilon where n*eps is O(1)
# True sum = 1.0
eps_val = np.float32(1e-7)  # Near machine epsilon
n_vals  = [10, 100, 1000, 10000, 100000]

print(f'Summing n copies of {eps_val}')
print(f'{"n":>8}  {"Naive fp32":>14}  {"Kahan fp32":>14}  {"NumPy fp64":>14}  {"True":>14}')

for n in n_vals:
    arr = np.full(n, eps_val, dtype=np.float32)
    true_val = float(eps_val) * n
    naive = float(naive_sum_fp32(arr))
    kahan = float(kahan_sum_fp32(arr))
    fp64  = float(np.sum(arr.astype(np.float64)))
    print(f'{n:8d}  {naive:14.8f}  {kahan:14.8f}  {fp64:14.8f}  {true_val:14.8f}')

Code cell 14

# === 3.2 Error Growth with n ===

n_trials = list(range(1, 201))
naive_errors = []
kahan_errors = []

# Sum of n random fp32 values — compare to fp64 sum as reference
np.random.seed(42)
base_arr = np.random.uniform(1.0, 2.0, 200).astype(np.float32)

for n in n_trials:
    arr = base_arr[:n]
    true_val = float(np.sum(arr.astype(np.float64)))  # fp64 as ground truth
    naive = float(naive_sum_fp32(arr))
    kahan = float(kahan_sum_fp32(arr))
    naive_errors.append(abs(naive - true_val) / abs(true_val))
    kahan_errors.append(abs(kahan - true_val) / abs(true_val) + 1e-20)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(n_trials, naive_errors, color=COLORS['error'], label='Naive fp32 sum')
    ax.semilogy(n_trials, kahan_errors, color=COLORS['primary'], label='Kahan fp32 sum')
    ax.axhline(float(np.finfo(np.float32).eps), color=COLORS['neutral'],
               linestyle='--', label='fp32 ε_mach')
    ax.set_xlabel('Number of elements n')
    ax.set_ylabel('Relative error (log scale)')
    ax.set_title('Error growth: Naive vs Kahan summation')
    ax.legend()
    plt.tight_layout()
    plt.show()
    print('Plot displayed.')

# Verify Kahan error stays near machine epsilon
eps_mach = float(np.finfo(np.float32).eps)
ok = max(kahan_errors) < 10 * eps_mach
print(f"{'PASS' if ok else 'FAIL'} - Kahan error bounded by ~10*eps_mach")
print(f'Max Kahan error: {max(kahan_errors):.2e}  (eps_mach = {eps_mach:.2e})')
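
Worth knowing: np.sum is already much better than a left-to-right loop, because NumPy uses pairwise (tree) summation for contiguous float reductions, keeping error growth roughly logarithmic in n. A quick fp32 comparison of the three strategies (array size chosen arbitrarily here):

# Naive left-to-right vs NumPy's pairwise reduction vs Kahan, all in fp32
big = np.random.uniform(1.0, 2.0, 100_000).astype(np.float32)
ref = float(np.sum(big.astype(np.float64)))  # fp64 ground truth
for name, val in [('naive loop',      float(naive_sum_fp32(big))),
                  ('np.sum pairwise', float(np.sum(big))),
                  ('Kahan loop',      float(kahan_sum_fp32(big)))]:
    print(f'  {name:16s} rel err: {abs(val - ref) / ref:.2e}')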

4. Condition Numbers and Error Amplification

Demonstrate how the condition number $\kappa(A) = \sigma_{\max}/\sigma_{\min}$ bounds the forward error.

Code cell 16

# === 4.1 Condition Number of Scalar Functions ===

# For f(x) = sqrt(x), cond = |x f'(x)| / |f(x)| = 1/2
# For f(x) = 1 - x near x=1, cond = x / (1-x) -> infinity as x -> 1

x_vals = np.linspace(0.01, 0.999, 300)

# Condition number of subtraction f(x) = 1 - x near x = 1
# cond = |x| / |1 - x|
cond_subtract = np.abs(x_vals) / np.abs(1 - x_vals)

# Condition number of sqrt: 1/2
cond_sqrt = np.full_like(x_vals, 0.5)

# Condition number of exp: |x|
cond_exp = np.abs(x_vals)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(x_vals, cond_subtract, color=COLORS['error'], label='1 - x (cancellation)')
    ax.semilogy(x_vals, cond_exp, color=COLORS['secondary'], label='exp(x)')
    ax.axhline(0.5, color=COLORS['primary'], linestyle='--', label='sqrt(x) (cond = 0.5)')
    ax.set_xlabel('x')
    ax.set_ylabel('Condition number (log scale)')
    ax.set_title('Condition numbers of scalar functions')
    ax.legend()
    plt.tight_layout()
    plt.show()
    print('Plot displayed.')

x = np.float64(0.9999)
cond = abs(x) / abs(1 - x)
eps_mach = np.finfo(np.float64).eps
print(f'At x = 0.9999: cond(1-x) = {cond:.1f}')
print(f'Expected error amplification: ~{cond * eps_mach:.2e}')
# The subtraction itself is exact here (Sterbenz lemma); the danger is that
# a relative error of eps_mach already present in x is amplified by cond:
x_pert = x * (1 + eps_mach)
rel_out = abs((1 - x_pert) - (1 - x)) / abs(1 - x)
print(f'Output error from a one-eps input perturbation: {rel_out:.2e}')

Code cell 17

# === 4.2 Matrix Condition Number and Linear System Sensitivity ===

import numpy.linalg as nla

def build_hilbert(n):
    """Hilbert matrix H_{ij} = 1/(i+j-1), notoriously ill-conditioned."""
    i, j = np.meshgrid(np.arange(1, n+1), np.arange(1, n+1), indexing='ij')
    return 1.0 / (i + j - 1)

print('Condition numbers of Hilbert matrices:')
print(f'{"n":>4}  {"kappa(H_n)":>16}  {"log10(kappa)":>12}')
for n in range(2, 13):
    H = build_hilbert(n)
    kappa = nla.cond(H)
    print(f'{n:4d}  {kappa:16.4e}  {np.log10(kappa):12.1f}')

print()
print('As n increases, Hilbert matrix becomes exponentially ill-conditioned.')
print('For n=10, kappa ~ 1e13, so ~13 of fp64\'s ~16 digits are lost in solving Hx=b.')
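
Since $\kappa_2(A)$ is by definition the ratio of extreme singular values, we can confirm nla.cond against an explicit SVD:

# kappa_2 = sigma_max / sigma_min — confirm via explicit SVD
H5 = build_hilbert(5)
svals = nla.svd(H5, compute_uv=False)  # Singular values, descending
print(f'sigma_max / sigma_min = {svals[0] / svals[-1]:.6e}')
print(f'nla.cond(H5)          = {nla.cond(H5):.6e}')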

Code cell 18

# === 4.3 Perturbation Test: Forward Error Bound ===

# Solve Hx = b for two right-hand sides that differ by a tiny perturbation
# Forward error ~ kappa(H) * relative perturbation

n = 8
H = build_hilbert(n)
kappa = nla.cond(H)

# True solution: all ones
x_true = np.ones(n)
b = H @ x_true

# Perturbed RHS
np.random.seed(0)
delta_b = np.random.randn(n) * 1e-10  # tiny perturbation
b_perturbed = b + delta_b

# Solve both systems
x_computed  = nla.solve(H, b)
x_perturbed = nla.solve(H, b_perturbed)

rel_perturb_b = nla.norm(delta_b) / nla.norm(b)
rel_error_x   = nla.norm(x_perturbed - x_computed) / nla.norm(x_computed)
amplification = rel_error_x / rel_perturb_b

print(f'Hilbert matrix H_{n}: kappa = {kappa:.3e}')
print(f'Relative perturbation in b: {rel_perturb_b:.2e}')
print(f'Resulting relative error in x: {rel_error_x:.2e}')
print(f'Error amplification factor: {amplification:.2e}')
print(f'Predicted bound (kappa): {kappa:.2e}')

ok = amplification <= kappa * 10  # Within order of magnitude
print(f"{'PASS' if ok else 'FAIL'} - amplification bounded by kappa")

5. Stable Algorithms for AI

Implement and verify log-sum-exp, stable softmax, and stable cross-entropy.

Code cell 20

# === 5.1 Log-Sum-Exp Stability ===

def logsumexp_naive(x):
    """Naive implementation — overflows for large x."""
    return np.log(np.sum(np.exp(x)))

def logsumexp_stable(x):
    """Stable implementation using shift by max."""
    c = np.max(x)
    return c + np.log(np.sum(np.exp(x - c)))

# Test with large values (overflow territory for fp32)
x_large = np.array([100.0, 200.0, 300.0, 400.0], dtype=np.float32)
x_small = np.array([-1000.0, -2000.0, -500.0], dtype=np.float32)

print('Large values (fp32):')
print(f'  naive    = {logsumexp_naive(x_large)} (Inf — overflow!)')
print(f'  stable   = {logsumexp_stable(x_large):.6f}')
print(f'  fp64 ref = {float(np.log(np.sum(np.exp(x_large.astype(np.float64))))):.6f}')

print()
print('Small values (fp32):')
print(f'  naive    = {logsumexp_naive(x_small)}')
print(f'  stable   = {logsumexp_stable(x_small):.6f}')

# Verify stable equals fp64 computation
x_test = np.array([1.0, 2.0, 3.0], dtype=np.float64)
ref_direct  = np.log(np.sum(np.exp(x_test)))  # Small inputs: direct fp64 is safe
from_stable = logsumexp_stable(x_test)
ok = np.isclose(ref_direct, from_stable)
print(f"\n{'PASS' if ok else 'FAIL'} - stable logsumexp matches reference")

Code cell 21

# === 5.2 Stable Softmax ===

def softmax_naive(x):
    e = np.exp(x)
    return e / e.sum()

def softmax_stable(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

# Test: large logits (typical in early training)
logits_large = np.array([1000.0, 1001.0, 999.0], dtype=np.float32)

print('Softmax with large logits (fp32):')
with np.errstate(over='ignore', invalid='ignore'):
    naive_result = softmax_naive(logits_large)
print(f'  Naive: {naive_result}  (all NaN: exp overflows to inf, then inf/inf)')

stable_result = softmax_stable(logits_large)
print(f'  Stable: {stable_result}')
print(f'  Sum: {stable_result.sum():.6f} (should be 1.0)')

# Verify: softmax([1000, 1001, 999]) = softmax([0, 1, -1])
reference = softmax_stable(np.array([0.0, 1.0, -1.0]))
ok = np.allclose(stable_result, reference, atol=1e-5)
print(f"{'PASS' if ok else 'FAIL'} - stable softmax equals shifted version")

print()
# Demonstrate across a range of magnitudes
print('Stable softmax properties:')
for scale in [1, 10, 100, 1000]:
    z = np.array([scale, scale+1, scale-1], dtype=np.float64)
    s = softmax_stable(z)
    print(f'  scale={scale:4d}: {s.round(4)}, sum={s.sum():.6f}')

Code cell 22

# === 5.3 Stable Cross-Entropy Loss ===

def cross_entropy_naive(logits, target_idx):
    """Naive: compute softmax then log."""
    probs = softmax_naive(logits)
    return -np.log(probs[target_idx])

def cross_entropy_stable(logits, target_idx):
    """Stable: log-softmax = logit - log-sum-exp."""
    log_softmax = logits - logsumexp_stable(logits)
    return -log_softmax[target_idx]

# Test case: model very confident about wrong class
# logits: class 0 gets 1000, target is class 1
logits = np.array([1000.0, 2.0, 1.0], dtype=np.float32)
target = 1

print('Cross-entropy with extreme logits:')
with np.errstate(over='ignore', invalid='ignore', divide='ignore'):
    loss_naive = cross_entropy_naive(logits, target)
print(f'  Naive: {loss_naive}  (target prob underflows to 0, so -log(0) = inf)')

loss_stable = cross_entropy_stable(logits, target)
print(f'  Stable: {loss_stable:.6f}')

# Check: log_softmax = logits - logsumexp(logits), and logsumexp ≈ 1000 here,
# so loss = -(2 - 1000) = 998. Equivalently, the target's softmax probability
# is exp(2 - 1000) / (1 + exp(-998) + exp(-999)) ≈ exp(-998).
expected_approx = 1000.0 - 2.0  # ≈ 998
print(f'  Expected (approx): {expected_approx:.1f}')
print(f'  Computed:          {loss_stable:.6f}')

ok = abs(loss_stable - expected_approx) < 5.0  # Within reasonable tolerance
print(f"{'PASS' if ok else 'FAIL'} - stable cross-entropy gives sensible result")

6. Special Values: NaN, Infinity, and Signed Zero

Explore how IEEE 754 special values propagate through computations.

Code cell 24

# === 6.1 NaN Propagation ===

nan = np.float32('nan')
inf = np.float32('inf')
zero = np.float32(0.0)

print('NaN arithmetic rules (IEEE 754):')
ops = [
    ('nan + 1', nan + 1),
    ('nan * 0', nan * zero),
    ('nan * inf', nan * inf),
    ('0 / 0', zero / zero),
    ('inf - inf', inf - inf),
    ('inf * 0', inf * zero),
]
for name, val in ops:
    print(f'  {name} = {val}')

print()
print('Infinity arithmetic:')
ops_inf = [
    ('inf + 1', inf + 1),
    ('inf + inf', inf + inf),
    ('1 / 0', np.float32(1.0) / zero),
    ('1 / inf', np.float32(1.0) / inf),
    ('-1 / 0', np.float32(-1.0) / zero),
]
for name, val in ops_inf:
    print(f'  {name} = {val}')

print()
print('NaN detection (NaN != NaN is True):')
print(f'  nan == nan:  {nan == nan}')
print(f'  nan != nan:  {nan != nan}')
print(f'  np.isnan(nan): {np.isnan(nan)}')

ok = np.isnan(nan) and not (nan == nan)
print(f"{'PASS' if ok else 'FAIL'} - NaN properties verified")

Code cell 25

# === 6.2 Signed Zero ===

pos_zero = np.float32(+0.0)
neg_zero = np.float32(-0.0)

print('Signed zero properties:')
print(f'  +0.0 == -0.0: {pos_zero == neg_zero}')
print(f'  1 / (+0.0) = {np.float32(1.0) / pos_zero}')
print(f'  1 / (-0.0) = {np.float32(1.0) / neg_zero}')
print(f'  str(+0.0) = {pos_zero}')
print(f'  str(-0.0) = {neg_zero}')

# Bit difference
_, _, _, pbits = fp32_bits(+0.0)
_, _, _, nbits = fp32_bits(-0.0)
print(f'  +0.0 bits: {pbits}')
print(f'  -0.0 bits: {nbits}')

print()
print('Practical impact: relu gradient at x=0')
# relu(x) = max(0, x)
# At x=0, the gradient is conventionally 0 (subgradient)
# Some implementations return +0, others -0
x = np.float32(0.0)
relu_at_zero = np.maximum(np.float32(0.0), x)
print(f'  relu(0) = {relu_at_zero}')
print(f'  relu(0) is pos zero: {not np.signbit(relu_at_zero)}')

7. Numerical Formats for AI: Mixed Precision Simulation

Simulate mixed-precision training dynamics and quantization effects.

Code cell 27

# === 7.1 Precision Comparison: Format Ranges ===

formats = {
    'fp64': np.float64,
    'fp32': np.float32,
    'fp16': np.float16,
}

print(f'{"Format":>8}  {"eps_mach":>14}  {"max":>12}  {"min_normal":>12}')
for name, dtype in formats.items():
    info = np.finfo(dtype)
    print(f'{name:>8}  {info.eps:14.3e}  {info.max:12.3e}  {info.tiny:12.3e}')

# bf16: same exponent as fp32, fewer mantissa bits
# bf16 eps = 2^-7 ≈ 7.8e-3, max ≈ 3.4e38 (same as fp32)
bf16_eps = 2**-7
bf16_max = (2 - 2**-7) * 2**127
print(f'{"bf16":>8}  {bf16_eps:14.3e}  {bf16_max:12.3e}  {"~1.2e-38":>12}')

print()
print('Key insight: bf16 has same RANGE as fp32 but much lower PRECISION.')
print('fp16 has better precision than bf16 but much smaller range.')

Code cell 28

# === 7.2 Simulating bf16 Quantization ===

def to_bf16(x):
    """Simulate bf16 by truncating fp32 mantissa to 7 bits."""
    x32 = np.float32(x)
    # Reinterpret as uint32, zero out lower 16 bits
    bits = np.frombuffer(x32.tobytes(), dtype=np.uint32)[0]
    bf16_bits = bits & 0xFFFF0000  # Keep upper 16 bits (sign, exp, 7 mantissa)
    return np.frombuffer(np.uint32(bf16_bits).tobytes(), dtype=np.float32)[0]

# Test quantization error
test_vals = np.array([0.1, 0.5, 1.0, 3.14159, 100.0, 0.001], dtype=np.float32)

print(f'{"Value":>12}  {"bf16 approx":>14}  {"rel error":>12}')
for v in test_vals:
    v_bf16 = to_bf16(v)
    rel_err = abs(v_bf16 - v) / abs(v)
    print(f'{v:12.6f}  {v_bf16:14.6f}  {rel_err:12.4e}')

print()
# bf16 eps_mach ≈ 7.8e-3 so relative errors should be at most this
bf16_eps = 2**-7
print(f'bf16 machine epsilon: {bf16_eps:.4e}')
ok = all(abs(to_bf16(v) - v) / abs(v) <= 2 * bf16_eps for v in test_vals)
print(f"{'PASS' if ok else 'FAIL'} - bf16 quantization errors within 2*eps_mach")

Code cell 29

# === 7.3 Loss Scaling Simulation ===

# Simulate mixed-precision training with loss scaling
# Scenario: gradients in [1e-8, 1e-7], straddling fp16's smallest subnormal
# (2^-24 ≈ 6e-8): many gradients flush to zero unless the loss is scaled up

np.random.seed(42)
n_params = 1000

# Simulate training without and with loss scaling
def simulate_step(grads_fp32, scale=1.0, fp16_max=65504.0):
    """Simulate one step of fp16 gradient."""
    scaled_grads = grads_fp32 * scale
    # Check overflow
    overflow = np.any(np.abs(scaled_grads) > fp16_max)
    if overflow:
        return None, True  # Skip step
    # Quantize to fp16 precision
    grads_fp16 = scaled_grads.astype(np.float16).astype(np.float32)
    grads_unscaled = grads_fp16 / scale
    return grads_unscaled, False

# True gradients: tiny magnitudes that need scaling to survive fp16
true_grads = np.random.uniform(1e-8, 1e-7, n_params)  # Tiny gradients

# No scaling
g_no_scale, ovf = simulate_step(true_grads, scale=1.0)
n_underflow_no = np.sum(g_no_scale == 0) if g_no_scale is not None else 0

# With loss scaling (S = 1024)
g_scaled, ovf = simulate_step(true_grads, scale=1024.0)
n_underflow_s = np.sum(g_scaled == 0) if g_scaled is not None else 0

print('Gradient survival with fp16:')
print(f'  True gradient range: [{true_grads.min():.2e}, {true_grads.max():.2e}]')
print(f'  fp16 min subnormal:  {float(np.nextafter(np.float16(0), np.float16(1))):.2e}')
print()
print(f'  No scaling:   {n_underflow_no}/{n_params} gradients underflow to zero')
if g_no_scale is not None:
    print(f'               Effective gradient: [{g_no_scale.min():.2e}, {g_no_scale.max():.2e}]')
print()
print(f'  Scale=1024:   {n_underflow_s}/{n_params} gradients underflow to zero')
if g_scaled is not None:
    print(f'               Effective gradient: [{g_scaled.min():.2e}, {g_scaled.max():.2e}]')

ok = n_underflow_no > n_underflow_s
print(f"{'PASS' if ok else 'FAIL'} - loss scaling reduces gradient underflow")

8. Stable Elementary Functions: expm1 and log1p

For arguments near zero, $e^x - 1$ and $\ln(1+x)$ require special implementations.

Code cell 31

# === 8.1 expm1 vs exp-1 ===

# For small x: exp(x) - 1 suffers catastrophic cancellation
# expm1(x) uses the identity: expm1(x) = x + x^2/2 + x^3/6 + ...

x_small = np.logspace(-15, -1, 100)

naive_result  = np.exp(x_small) - 1.0
stable_result = np.expm1(x_small)

# Reference: Taylor series (exact for tiny x)
# expm1(x) ≈ x + x^2/2 + x^3/6 for |x| << 1
taylor_ref = x_small + x_small**2/2 + x_small**3/6

# Relative errors
err_naive  = np.abs(naive_result  - taylor_ref) / (taylor_ref + 1e-300)
err_stable = np.abs(stable_result - taylor_ref) / (taylor_ref + 1e-300)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.loglog(x_small, err_naive,  color=COLORS['error'],   label='exp(x) - 1 (naive)')
    ax.loglog(x_small, err_stable + 1e-17, color=COLORS['primary'], label='expm1(x) (stable)')
    ax.axhline(np.finfo(np.float64).eps, color=COLORS['neutral'],
               linestyle='--', label='fp64 ε_mach')
    ax.set_xlabel('x')
    ax.set_ylabel('Relative error')
    ax.set_title('expm1 vs exp(x)-1 for small x')
    ax.legend()
    plt.tight_layout()
    plt.show()
    print('Plot displayed.')

print('Comparison at x = 1e-10:')
x = 1e-10
print(f'  exp(x) - 1 = {np.exp(x) - 1:.20e}')
print(f'  expm1(x)   = {np.expm1(x):.20e}')
print(f'  x (leading term) = {x:.20e}')

Code cell 32

# === 8.2 log1p vs log(1+x) ===


print('log1p precision at small x:')
for x in [1e-15, 1e-12, 1e-8, 1e-4, 1e-2]:
    naive  = np.log(1.0 + x)
    stable = np.log1p(x)
    print(f'  x={x:.0e}: log(1+x)={naive:.15e}  log1p(x)={stable:.15e}')

print()
# Application: Information-theoretic log-likelihood
# -log(1 - p) for p near 0 (sparse attention weights)
p = 1e-12
print(f'Negative log-likelihood at p={p:.0e}:')
print(f'  -log(1-p) naive  = {-np.log(1-p):.15e}')
print(f'  -log1p(-p) stable = {-np.log1p(-p):.15e}')
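
A composite that needs both tricks is softplus, $\log(1 + e^x)$: the naive form overflows for large positive $x$. A stable sketch using the standard identity $\log(1+e^x) = \max(x, 0) + \mathrm{log1p}(e^{-|x|})$:

def softplus_stable(x):
    """log(1 + e^x) via max(x, 0) + log1p(exp(-|x|)): no overflow."""
    x = np.asarray(x, dtype=np.float64)
    return np.maximum(x, 0) + np.log1p(np.exp(-np.abs(x)))

for x in [-1000.0, -1.0, 0.0, 1.0, 1000.0]:
    print(f'  softplus({x:7.1f}) = {softplus_stable(x):.6f}')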

9. Floating-Point Number Line

Visualize the non-uniform distribution of representable numbers.

Code cell 34

# === 9.1 Floating-Point Number Line Density ===

# Enumerate all positive finite fp16 values to show the structure of
# the floating-point number line directly.

# Generate all positive fp16 values
all_bits = np.arange(0, 0x7C00, dtype=np.uint16)  # Exclude inf/NaN
# Convert to fp16
fp16_vals = np.frombuffer(all_bits.tobytes(), dtype=np.float16)
fp16_vals = fp16_vals.astype(np.float64)

print(f'Total positive finite fp16 values: {len(fp16_vals)}')
print(f'Range: [{fp16_vals.min():.4e}, {fp16_vals.max():.4e}]')

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(14, 8))

    # Top: all values on number line
    ax = axes[0]
    # Sample for clarity
    sample = fp16_vals[::64]
    ax.scatter(sample, np.zeros_like(sample), s=3,
               alpha=0.5, color=COLORS['primary'])
    ax.set_xlabel('Value')
    ax.set_yticks([])
    ax.set_title('Positive fp16 values (sampled): denser near zero')
    ax.set_xlim(-1, 70)

    # Bottom: gap sizes vs value (log-log shows linear relationship)
    ax = axes[1]
    gaps = np.diff(fp16_vals[::8])
    midpoints = (fp16_vals[::8][:-1] + fp16_vals[::8][1:]) / 2
    ok_mask = gaps > 0
    ax.loglog(midpoints[ok_mask], gaps[ok_mask],
              color=COLORS['primary'], linewidth=1.5)
    ax.set_xlabel('Value (log scale)')
    ax.set_ylabel('Gap to next fp16 (log scale)')
    ax.set_title('fp16 spacing: doubles with each power of 2')

    fig.tight_layout()
    plt.show()
    print('Plot displayed.')

# Verify: relative spacing is constant = eps_mach within each binade
eps16 = float(np.finfo(np.float16).eps)
ok = True
for v in [0.5, 1.0, 2.0, 4.0]:
    v16 = np.float16(v)
    next_v16 = np.nextafter(v16, np.float16(float('inf')))
    gap = float(next_v16) - float(v16)
    rel_gap = gap / float(v16)
    ok = ok and np.isclose(rel_gap, eps16, rtol=0.1)
    print(f'  v={float(v16):.1f}: gap={gap:.4e}, relative={rel_gap:.4e} (eps={eps16:.4e})')
print(f"{'PASS' if ok else 'FAIL'} - relative spacing equals eps_mach")

10. Rounding Modes

IEEE 754 defines five rounding modes. Round-to-nearest-even is the default.

Code cell 36

# === 10.1 Rounding Mode Comparison ===

# Demonstrate rounding modes using exact arithmetic via Python Fraction
from fractions import Fraction

def round_to_nearest_even(x, eps):
    """Round x to nearest multiple of eps, ties go to even."""
    quotient = x / eps
    lower = int(quotient)  # floor
    upper = lower + 1
    if quotient - lower < 0.5:
        return lower * eps
    elif quotient - lower > 0.5:
        return upper * eps
    else:  # Tie: pick even
        return (lower if lower % 2 == 0 else upper) * eps

# Compare at representable boundaries
# Values that lie exactly between two fp32 numbers
eps = float(np.finfo(np.float32).eps)

print('Rounding modes for values between representable fp32 numbers:')
print(f'{"Value":>14}  {"RN-even":>10}  {"Round-up":>10}  {"Truncate":>10}')

# Test values: exactly halfway between two representable numbers near 1.0
# Between 1.0 and 1.0 + eps: midpoint is 1.0 + eps/2
import math
v = 1.0 + eps/2  # Exact midpoint (representable in fp64)
rn_even = float(np.float32(v))     # Round to nearest even (hardware default)
trunc = math.floor(v / eps) * eps  # Truncation (round toward zero)
rup   = math.ceil(v / eps) * eps   # Round up (toward +inf)
print(f'{v:14.10f}  {rn_even:10.8f}  {rup:10.8f}  {trunc:10.8f}')
print(f'round_to_nearest_even model agrees: {round_to_nearest_even(v, eps) == rn_even}')

print()
print('Default IEEE 754 rounding: Round-to-nearest-even (RNE)')
print('Stochastic rounding: randomly rounds up or down, unbiased for training')
print()

# Demonstrate bias in round-half-up vs round-half-even
# on the exact ties 0.5, 1.5, ..., 5.5: the true mean is 3.0
half_values = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]

def round_half_up(x):   return math.floor(x + 0.5)
def round_half_even(x): return round(x)  # Python uses banker's rounding

rhu = [round_half_up(v)   for v in half_values]
rhe = [round_half_even(v) for v in half_values]
print(f'Round-half-up values:  {rhu}  mean={np.mean(rhu):.3f} (biased high!)')
print(f'Round-half-even values:{rhe}  mean={np.mean(rhe):.3f} (unbiased)')
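
Stochastic rounding, mentioned above, is not one of the five IEEE modes but appears in low-precision training hardware. A toy model (rounding to multiples of a step size rather than to a real float grid):

import math

def stochastic_round(x, step, rng):
    """Round x to a multiple of `step`, rounding up with probability equal
    to its fractional position, so the result is unbiased in expectation."""
    q = x / step
    lower = math.floor(q)
    return (lower + (rng.random() < (q - lower))) * step

rng = np.random.default_rng(0)
samples = [stochastic_round(2.3, 1.0, rng) for _ in range(10000)]
print(f'E[stochastic_round(2.3, step=1)] ≈ {np.mean(samples):.4f}  (unbiased target: 2.3)')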

11. Welford's Online Variance Algorithm

Numerically stable one-pass computation of mean and variance.

Code cell 38

# === 11.1 Welford vs Naive Variance ===

def welford_variance(data):
    """Compute mean and variance using Welford's online algorithm."""
    n = 0
    mean = 0.0
    M2 = 0.0  # Sum of squared deviations
    for x in data:
        n += 1
        delta = x - mean
        mean += delta / n
        delta2 = x - mean
        M2 += delta * delta2
    variance = M2 / (n - 1) if n > 1 else 0.0
    return mean, variance

def naive_variance(data):
    """Naive two-pass variance: Var = E[X^2] - E[X]^2."""
    n = len(data)
    mean = sum(data) / n
    return sum((x - mean)**2 for x in data) / (n - 1)

def naive_one_pass_variance(data):
    """One-pass naive: E[X^2] - (E[X])^2 — catastrophic cancellation!"""
    n = len(data)
    s1 = sum(data)
    s2 = sum(x**2 for x in data)
    return (s2 - s1**2 / n) / (n - 1)

# Test case: values with a large mean (cancellation kills the naive 1-pass)
# X ~ N(1e8, 1) — large mean, unit variance. Kept in fp64: fp32 has a ULP of
# 8 near 1e8, which would quantize the unit-variance structure away entirely.
np.random.seed(42)
data = list(1e8 + np.random.randn(10000))
true_variance = np.var(np.array(data, dtype=np.float64), ddof=1)

mean_w, var_w = welford_variance(data)
var_naive     = naive_variance(data)
var_onepass   = naive_one_pass_variance(data)

print(f'True variance (fp64): {true_variance:.8f}')
print(f'Welford variance:     {var_w:.8f}  error: {abs(var_w - true_variance):.2e}')
print(f'Naive 2-pass:         {var_naive:.8f}  error: {abs(var_naive - true_variance):.2e}')
print(f'Naive 1-pass:         {var_onepass:.8f}  error: {abs(var_onepass - true_variance):.2e}')

ok_w = abs(var_w - true_variance) < 0.01
ok_op = abs(var_onepass - true_variance) > abs(var_w - true_variance)
print(f"{'PASS' if ok_w else 'FAIL'} - Welford is accurate")
print(f"{'PASS' if ok_op else 'FAIL'} - naive 1-pass is less accurate than Welford")

12. Gradient Vanishing and Exploding in Floating Point

Simulate how floating-point precision limits gradient flow in deep networks.

Code cell 40

# === 12.1 Gradient Magnitude Through Layers ===

# Simulate backpropagation through a deep linear network
# Each layer: gradient multiplied by W^T
# For Gaussian weights with std sigma, E[||W^T g||^2] = sigma^2 * fan_in * ||g||^2

def simulate_gradient_flow(n_layers, sigma, n_neurons, dtype=np.float32, n_trials=50):
    """Simulate gradient norms through n_layers of random linear layers."""
    norms = []
    np.random.seed(42)
    for _ in range(n_trials):
        g = np.ones(n_neurons, dtype=dtype)
        layer_norms = [np.linalg.norm(g)]
        for _ in range(n_layers):
            W = (np.random.randn(n_neurons, n_neurons) * sigma).astype(dtype)
            g = (W.T @ g).astype(dtype)  # Keep the backprop signal in the target dtype
            layer_norms.append(float(np.linalg.norm(g)))
        norms.append(layer_norms)
    return np.mean(norms, axis=0)

n_neurons = 32
n_layers = 30

# Three scenarios: vanishing, stable (Xavier), exploding
sigma_vanish   = 0.1 / np.sqrt(n_neurons)
sigma_xavier   = 1.0 / np.sqrt(n_neurons)
sigma_explode  = 3.0 / np.sqrt(n_neurons)

norms_v = simulate_gradient_flow(n_layers, sigma_vanish,  n_neurons)
norms_x = simulate_gradient_flow(n_layers, sigma_xavier,  n_neurons)
norms_e = simulate_gradient_flow(n_layers, sigma_explode, n_neurons)

print(f'Gradient norms after {n_layers} layers:')
print(f'  Vanishing  (sigma={sigma_vanish:.4f}): {norms_v[-1]:.4e}')
print(f'  Xavier     (sigma={sigma_xavier:.4f}): {norms_x[-1]:.4f}')
print(f'  Exploding  (sigma={sigma_explode:.4f}): {norms_e[-1]:.4e}')

# fp16 underflow check
fp16_min = float(np.finfo(np.float16).tiny)
print(f'\nfp16 min normal: {fp16_min:.2e}')
print(f'Vanishing grad underflows fp16: {norms_v[-1] < fp16_min}')

Code cell 41

# === 12.2 Visualization: Gradient Flow ===

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    layers = np.arange(n_layers + 1)

    ax = axes[0]
    ax.semilogy(layers, norms_v + 1e-45, color=COLORS['error'], label='Vanishing (sigma=0.1/√n)')
    ax.semilogy(layers, norms_x, color=COLORS['primary'], label='Xavier (sigma=1/√n)')
    ax.semilogy(layers, norms_e, color=COLORS['secondary'], label='Exploding (sigma=3/√n)')
    ax.axhline(float(np.finfo(np.float16).tiny), color=COLORS['neutral'],
               linestyle=':', label='fp16 underflow')
    ax.axhline(float(np.finfo(np.float16).max), color=COLORS['highlight'],
               linestyle=':', label='fp16 overflow')
    ax.set_xlabel('Layer depth')
    ax.set_ylabel('Gradient norm (log scale)')
    ax.set_title('Gradient flow through deep network')
    ax.legend(fontsize=9)

    ax = axes[1]
    ax.plot(layers, norms_x, color=COLORS['primary'], label='Xavier (fp32)')
    # Simulate fp16
    norms_x_fp16 = simulate_gradient_flow(n_layers, sigma_xavier, n_neurons, dtype=np.float16)
    ax.plot(layers, norms_x_fp16.astype(np.float32), color=COLORS['secondary'],
            linestyle='--', label='Xavier (fp16)')
    ax.set_xlabel('Layer depth')
    ax.set_ylabel('Gradient norm')
    ax.set_title('fp32 vs fp16 gradient norms')
    ax.legend()

    fig.tight_layout()
    plt.show()
    print('Plot displayed.')

13. Summary and Verification

Consolidate all key concepts with a final verification suite.

Code cell 43

# === 13.1 Comprehensive Verification Suite ===

results = {}

# 1. Machine epsilon
eps32 = np.finfo(np.float32).eps
results['machine_epsilon'] = np.isclose(eps32, 2**-23)

# 2. Overflow detection
big = np.float32(3.4e38)
overflow = np.isinf(big * np.float32(2.0))
results['overflow_detection'] = overflow

# 3. Kahan summation
arr = np.full(10000, np.float32(1e-7))
naive_s  = np.float32(0.0)
for x in arr: naive_s += x
kahan_s = kahan_sum_fp32(arr)
true_s  = float(np.float32(1e-7)) * 10000  # Use the stored fp32 value for an exact target
results['kahan_better_than_naive'] = abs(float(kahan_s) - true_s) < abs(float(naive_s) - true_s)

# 4. Stable logsumexp
z = np.array([1000.0, 1001.0, 999.0], dtype=np.float32)
lse_stable = logsumexp_stable(z)
lse_ref    = float(np.log(np.sum(np.exp(z.astype(np.float64)))))
results['logsumexp_stable'] = np.isclose(lse_stable, lse_ref, rtol=1e-4)

# 5. Softmax translation invariance
z = np.array([1.0, 2.0, 3.0])
s1 = softmax_stable(z)
s2 = softmax_stable(z - 100)
results['softmax_translation_invariant'] = np.allclose(s1, s2)

# 6. Cancellation in quadratic formula
r2_good = np.float64(1.0) / r1_stable  # Vieta's formula
r2_naive_err = abs(r2_standard - r2_true) / r2_true
r2_stable_err = abs(r2_good - r2_true) / r2_true
results['vieta_formula_stable'] = r2_stable_err < r2_naive_err * 0.001

# 7. NaN propagation
nan = np.float32('nan')
results['nan_propagates'] = np.isnan(nan + 1.0)

# 8. Hilbert matrix ill-conditioned
H4 = build_hilbert(4)
results['hilbert_ill_conditioned'] = nla.cond(H4) > 1e4

# Print results
print('=== Verification Suite ===')
all_pass = True
for name, passed in results.items():
    status = 'PASS' if passed else 'FAIL'
    all_pass = all_pass and passed
    print(f'  [{status}] {name}')

print()
print(f'Overall: {"ALL PASS" if all_pass else "SOME FAILURES"}')
print(f'Passed: {sum(results.values())}/{len(results)}')

Key Takeaways

| Concept | Rule of Thumb |
| --- | --- |
| Machine epsilon | fp32: $2^{-23} \approx 1.2 \times 10^{-7}$; bf16: $2^{-7} \approx 7.8 \times 10^{-3}$ |
| Catastrophic cancellation | Occurs when subtracting nearly-equal quantities; use algebraic identities |
| Kahan summation | Reduces $O(n\varepsilon)$ error to $O(\varepsilon)$ using compensation |
| Condition number | $\kappa(A)$ tells how many digits are lost in solving $Ax = b$; $\kappa > 10^d$ loses $d$ digits |
| Mixed precision | bf16 compute + fp32 master weights is standard for LLM training |
| Loss scaling | Essential for fp16; multiply the loss by $S$, check for overflow, divide gradients by $S$ |
| Stable softmax | Always subtract the max before exp; use log-sum-exp for log-softmax |
| expm1 / log1p | Use these for $e^x - 1$ and $\ln(1+x)$ when $x \approx 0$ |

Next: Numerical Linear Algebra →