Theory Notebook
Converted from theory.ipynb for web reading.
Floating-Point Arithmetic — Theory Notebook
"The floating-point number system is a finite subset of the reals, and this finiteness has consequences."
Interactive derivations covering: IEEE 754 encoding, machine epsilon, catastrophic cancellation, Kahan summation, condition numbers, stable algorithms (log-sum-exp, softmax), and mixed-precision training.
Companion: notes.md | exercises.ipynb
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import struct
import sys
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style='whitegrid', palette='colorblind')
HAS_SNS = True
except ImportError:
plt.style.use('seaborn-v0_8-whitegrid')
HAS_SNS = False
mpl.rcParams.update({
'figure.figsize': (10, 6), 'figure.dpi': 120,
'font.size': 13, 'axes.titlesize': 15, 'axes.labelsize': 13,
'xtick.labelsize': 11, 'ytick.labelsize': 11,
'legend.fontsize': 11, 'lines.linewidth': 2.0,
'axes.spines.top': False, 'axes.spines.right': False,
})
HAS_MPL = True
except ImportError:
HAS_MPL = False
COLORS = {
'primary': '#0077BB',
'secondary': '#EE7733',
'tertiary': '#009988',
'error': '#CC3311',
'neutral': '#555555',
'highlight': '#EE3377',
}
np.set_printoptions(precision=15, suppress=False)
np.random.seed(42)
print('Setup complete.')
print(f'NumPy version: {np.__version__}')
print(f'Python float: {sys.float_info}')
print("Chapter helper setup complete.")
1. IEEE 754 Bit-Level Encoding
We decode the actual bits of floating-point numbers to understand the standard directly.
Code cell 5
# === 1.1 IEEE 754 Bit Decomposition ===
def fp32_bits(x):
"""Return the 32-bit IEEE 754 representation of a float32."""
packed = struct.pack('>f', np.float32(x))
bits = int.from_bytes(packed, 'big')
sign = (bits >> 31) & 1
exp = (bits >> 23) & 0xFF
mant = bits & 0x7FFFFF
return sign, exp, mant, f'{bits:032b}'
def decode_fp32(x):
"""Decode and print fp32 number."""
sign, exp, mant, bstr = fp32_bits(x)
print(f'Value: {np.float32(x)}')
print(f'Bits: {bstr[0]} {bstr[1:9]} {bstr[9:]}')
print(f' sign exponent mantissa')
if exp == 0 and mant == 0:
print(f' ({["+","-"][sign]}0 — signed zero)')
elif exp == 255 and mant == 0:
print(f' ({["+","-"][sign]}Infinity)')
elif exp == 255:
print(f' (NaN)')
elif exp == 0:
val = (-1)**sign * mant * 2**(-149)
print(f' Subnormal: (-1)^{sign} * 0.{mant:023b} * 2^(-126) = {val:.6e}')
else:
val = (-1)**sign * (1 + mant/2**23) * 2**(exp - 127)
print(f' Normal: (-1)^{sign} * 1.{mant:023b} * 2^({exp}-127) = {val:.6e}')
print()
print('=== Decoding key fp32 values ===')
for x in [1.0, -1.0, 0.5, 3.14, 0.0, float('inf'), float('nan')]:
decode_fp32(x)
Code cell 6
# === 1.2 Computing Machine Epsilon from First Principles ===
# Method 1: Iterate by halving
eps_fp64 = np.float64(1.0)
while np.float64(1.0) + eps_fp64 / 2 > np.float64(1.0):
eps_fp64 /= 2
eps_fp32 = np.float32(1.0)
while np.float32(1.0) + np.float32(eps_fp32) / 2 > np.float32(1.0):
eps_fp32 = np.float32(eps_fp32) / np.float32(2.0)
print('Machine epsilon by iteration:')
print(f' fp64: {eps_fp64:.20e} (theory: 2^-52 = {2**-52:.20e})')
print(f' fp32: {eps_fp32:.10e} (theory: 2^-23 = {2**-23:.10e})')
print()
# Method 2: NumPy info
print('NumPy machine epsilon:')
print(f' fp64: {np.finfo(np.float64).eps:.20e}')
print(f' fp32: {np.finfo(np.float32).eps:.10e}')
print(f' fp16: {np.finfo(np.float16).eps:.6e}')
print()
# Verify: 1.0 + eps is representable, 1.0 + eps/2 is not
ok = (np.float32(1.0) + eps_fp32 > np.float32(1.0)) and \
(np.float32(1.0) + eps_fp32/2 == np.float32(1.0))
print(f"{'PASS' if ok else 'FAIL'} - machine epsilon verified for fp32")
Code cell 7
# === 1.3 Density of Representable Numbers ===
# Near 1.0, spacing between consecutive fp32 numbers = eps_mach
# Near 2.0, spacing = 2 * eps_mach, etc.
def fp32_ulp(x):
"""Unit in the last place for fp32."""
x = np.float32(x)
_, exp, _, _ = fp32_bits(x)
if exp == 0: # subnormal
return np.float32(2**-149)
return np.float32(2**(exp - 127 - 23))
print('ULP (spacing to next fp32) at key values:')
for x in [0.5, 1.0, 2.0, 4.0, 1e6, 1e10, 65504.0]:
ulp = fp32_ulp(x)
print(f' x = {x:12.1f}: ULP = {ulp:.3e} '
f'(relative: {ulp/x:.3e})')
print()
print(f'fp32 max value: {np.finfo(np.float32).max:.6e}')
print(f'fp16 max value: {np.finfo(np.float16).max:.1f}')
print(f'fp32 min normal: {np.finfo(np.float32).tiny:.6e}')
print(f'fp32 min subnormal: {np.nextafter(np.float32(0), np.float32(1)):.6e}')
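As a cross-check (a supplementary sketch, not part of the original cell), NumPy exposes the same quantity directly as np.spacing, the gap from x to the adjacent representable value of the same dtype:
# Cross-check fp32_ulp against NumPy's built-in spacing function
for x in [0.5, 1.0, 2.0, 1e6]:
    print(f'  x={x:10.1f}: fp32_ulp={float(fp32_ulp(x)):.3e}  np.spacing={float(np.spacing(np.float32(x))):.3e}')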
2. Rounding Error and Catastrophic Cancellation
We demonstrate that subtraction of nearly-equal numbers can destroy all significant digits.
Code cell 9
# === 2.1 Catastrophic Cancellation ===
# Example: compute 1 - cos(x) for small x
# True value ≈ x^2/2 for small x
x_vals = np.logspace(-1, -8, 50) # From 0.1 to 1e-8
true_val = x_vals**2 / 2 # Leading-order Taylor approximation
# Method 1: Direct (catastrophic cancellation for small x)
direct_fp32 = np.float32(1.0) - np.cos(x_vals.astype(np.float32))
direct_fp64 = 1.0 - np.cos(x_vals)
# Method 2: Stable (use 2*sin^2(x/2) = 1 - cos(x))
stable_fp32 = 2 * np.sin(x_vals.astype(np.float32)/2)**2
# Relative errors
# Use fp64 direct as reference for fp64, and stable fp32 as reference for fp32
ref = 2 * np.sin(x_vals/2)**2 # Stable fp64 as ground truth
err_direct32 = np.abs(direct_fp32 - ref) / (ref + 1e-300)
err_stable32 = np.abs(stable_fp32 - ref) / (ref + 1e-300)
err_direct64 = np.abs(direct_fp64 - ref) / (ref + 1e-300)
print('Relative error in computing 1 - cos(x):')
print(f'{"x":>12} {"Direct fp32":>14} {"Stable fp32":>14} {"Direct fp64":>14}')
for i in [0, 10, 20, 30, 40, 45]:
print(f'{x_vals[i]:12.2e} {err_direct32[i]:14.3e} '
f'{err_stable32[i]:14.3e} {err_direct64[i]:14.3e}')
Code cell 10
# === 2.2 Visualization: Catastrophic Cancellation ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: Relative error vs x
ax = axes[0]
ax.loglog(x_vals, err_direct32 + 1e-16,
color=COLORS['error'], label='Direct fp32 (1 - cos x)', linewidth=2)
ax.loglog(x_vals, err_stable32 + 1e-16,
color=COLORS['primary'], label='Stable fp32 (2 sin²(x/2))', linewidth=2)
ax.loglog(x_vals, err_direct64 + 1e-16,
color=COLORS['secondary'], label='Direct fp64', linewidth=2, linestyle='--')
ax.axhline(np.finfo(np.float32).eps, color=COLORS['neutral'],
linestyle=':', label='fp32 ε_mach')
ax.set_xlabel('x')
ax.set_ylabel('Relative error')
ax.set_title('Catastrophic cancellation: 1 - cos(x)')
ax.legend(fontsize=10)
# Right: Significant digits lost
ax = axes[1]
digits_direct = -np.log10(err_direct32 + 1e-16)
digits_stable = -np.log10(err_stable32 + 1e-16)
ax.semilogx(x_vals, np.clip(digits_direct, 0, 8),
color=COLORS['error'], label='Direct fp32', linewidth=2)
ax.semilogx(x_vals, np.clip(digits_stable, 0, 8),
color=COLORS['primary'], label='Stable fp32', linewidth=2)
ax.axhline(7, color=COLORS['neutral'], linestyle=':', label='fp32 max digits (~7)')
ax.set_xlabel('x')
ax.set_ylabel('Significant digits correct')
ax.set_title('Significant digits: stable vs. direct computation')
ax.legend()
fig.tight_layout()
plt.show()
print('Plot displayed.')
Code cell 11
# === 2.3 Quadratic Formula Cancellation ===
# For x^2 - 56x + 1 = 0, roots are ~55.982 and ~0.01786
# The standard formula loses precision for the small root
a, b, c = np.float64(1.0), np.float64(-56.0), np.float64(1.0)
disc = np.sqrt(b**2 - 4*a*c)
# Standard formula
r1_standard = (-b + disc) / (2*a)
r2_standard = (-b - disc) / (2*a) # Small root — catastrophic cancellation!
# Stable formula for small root: c / (a * r1)
r1_stable = (-b + disc) / (2*a) # Large root — accurate
r2_stable = c / (a * r1_stable) # Small root via Vieta's formula
# Reference values: r1 from the benign branch of the formula; r2 via Vieta's
# relation r1*r2 = c/a = 1, which involves no cancellation
r1_true = 28 + np.sqrt(783)   # = 28 + 27.982...
r2_true = 1.0 / r1_true       # = 28 - sqrt(783) = 0.017862...
print('Roots of x^2 - 56x + 1 = 0:')
print(f'True r1 = {r1_true:.15f}')
print(f'True r2 = {r2_true:.15f}')
print()
print(f'Standard r1 = {r1_standard:.15f} error: {abs(r1_standard - r1_true):.2e}')
print(f'Standard r2 = {r2_standard:.15f} error: {abs(r2_standard - r2_true):.2e}')
print()
print(f'Stable r1 = {r1_stable:.15f} error: {abs(r1_stable - r1_true):.2e}')
print(f'Stable r2 = {r2_stable:.15f} error: {abs(r2_stable - r2_true):.2e}')
rel_err_standard = abs(r2_standard - r2_true) / r2_true
rel_err_stable = abs(r2_stable - r2_true) / r2_true
print()
print(f'Relative error (standard): {rel_err_standard:.2e}')
print(f'Relative error (stable): {rel_err_stable:.2e}')
# Here the cancellation factor is only ~1.6e3, so fp64 loses ~3 digits;
# for larger |b| the standard formula can lose essentially all of them
ok = rel_err_stable < 1e-13 and rel_err_standard > rel_err_stable
print(f"{'PASS' if ok else 'FAIL'} - stable formula avoids cancellation")
3. Kahan Compensated Summation
Demonstrate that naive summation error grows with n (worst case ~n·eps_mach), while Kahan compensated summation keeps the relative error at O(eps_mach), independent of n.
Code cell 13
# === 3.1 Naive vs Kahan Summation Error ===
def naive_sum_fp32(arr):
s = np.float32(0.0)
for x in arr:
s += np.float32(x)
return s
def kahan_sum_fp32(arr):
s = np.float32(0.0)
c = np.float32(0.0) # Compensation
for x in arr:
y = np.float32(x) - c
t = s + y
c = (t - s) - y # Captured rounding error
s = t
return s
# Sum of n copies of a small value epsilon where n*eps is O(1)
# True sum = 1.0
eps_val = np.float32(1e-7) # Near machine epsilon
n_vals = [10, 100, 1000, 10000, 100000]
print(f'Summing n copies of {eps_val}')
print(f'{"n":>8} {"Naive fp32":>14} {"Kahan fp32":>14} {"NumPy fp64":>14} {"True":>14}')
for n in n_vals:
arr = np.full(n, eps_val, dtype=np.float32)
true_val = float(eps_val) * n
naive = float(naive_sum_fp32(arr))
kahan = float(kahan_sum_fp32(arr))
fp64 = float(np.sum(arr.astype(np.float64)))
print(f'{n:8d} {naive:14.8f} {kahan:14.8f} {fp64:14.8f} {true_val:14.8f}')
Code cell 14
# === 3.2 Error Growth with n ===
n_trials = list(range(1, 201))
naive_errors = []
kahan_errors = []
# Sum of n random fp32 values — compare to fp64 sum as reference
np.random.seed(42)
base_arr = np.random.uniform(1.0, 2.0, 200).astype(np.float32)
for n in n_trials:
arr = base_arr[:n]
true_val = float(np.sum(arr.astype(np.float64))) # fp64 as ground truth
naive = float(naive_sum_fp32(arr))
kahan = float(kahan_sum_fp32(arr))
naive_errors.append(abs(naive - true_val) / abs(true_val))
kahan_errors.append(abs(kahan - true_val) / abs(true_val) + 1e-20)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.semilogy(n_trials, naive_errors, color=COLORS['error'], label='Naive fp32 sum')
ax.semilogy(n_trials, kahan_errors, color=COLORS['primary'], label='Kahan fp32 sum')
ax.axhline(float(np.finfo(np.float32).eps), color=COLORS['neutral'],
linestyle='--', label='fp32 ε_mach')
ax.set_xlabel('Number of elements n')
ax.set_ylabel('Relative error (log scale)')
ax.set_title('Error growth: Naive vs Kahan summation')
ax.legend()
plt.tight_layout()
plt.show()
print('Plot displayed.')
# Verify Kahan error stays near machine epsilon
eps_mach = float(np.finfo(np.float32).eps)
ok = max(kahan_errors) < 10 * eps_mach
print(f"{'PASS' if ok else 'FAIL'} - Kahan error bounded by ~10*eps_mach")
print(f'Max Kahan error: {max(kahan_errors):.2e} (eps_mach = {eps_mach:.2e})')
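Worth knowing before reaching for Kahan everywhere: NumPy's np.sum reduces contiguous float arrays with pairwise (tree) summation, which already keeps the error far below a naive sequential loop. A small added comparison (the array size is arbitrary):
# Compare a sequential fp32 loop, NumPy's pairwise reduction, and Kahan
vals = np.random.uniform(1.0, 2.0, 20000).astype(np.float32)
ref = float(np.sum(vals.astype(np.float64)))          # fp64 ground truth
for name, s in [('sequential loop', float(naive_sum_fp32(vals))),
                ('np.sum (pairwise)', float(np.sum(vals))),
                ('Kahan', float(kahan_sum_fp32(vals)))]:
    print(f'  {name:>18}: relative error = {abs(s - ref) / abs(ref):.2e}')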
4. Condition Numbers and Error Amplification
Demonstrate how the condition number bounds forward error.
Code cell 16
# === 4.1 Condition Number of Scalar Functions ===
# For f(x) = sqrt(x), cond = |x f'(x)| / |f(x)| = 1/2
# For f(x) = 1 - x near x=1, cond = x / (1-x) -> infinity as x -> 1
x_vals = np.linspace(0.01, 0.999, 300)
# Condition number of subtraction f(x) = 1 - x near x = 1
# cond = |x| / |1 - x|
cond_subtract = np.abs(x_vals) / np.abs(1 - x_vals)
# Condition number of sqrt: 1/2
cond_sqrt = np.full_like(x_vals, 0.5)
# Condition number of exp: |x|
cond_exp = np.abs(x_vals)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.semilogy(x_vals, cond_subtract, color=COLORS['error'], label='1 - x (cancellation)')
ax.semilogy(x_vals, cond_exp, color=COLORS['secondary'], label='exp(x)')
ax.axhline(0.5, color=COLORS['primary'], linestyle='--', label='sqrt(x) (cond = 0.5)')
ax.set_xlabel('x')
ax.set_ylabel('Condition number (log scale)')
ax.set_title('Condition numbers of scalar functions')
ax.legend()
plt.tight_layout()
plt.show()
print('Plot displayed.')
x = np.float64(0.9999)
cond = abs(x) / abs(1 - x)
eps_mach = np.finfo(np.float64).eps
print(f'At x = 0.9999: cond(1-x) = {cond:.1f}')
print(f'A relative error of ~eps_mach already carried by x is amplified to ~{cond * eps_mach:.2e} in 1-x')
print('(The subtraction itself is exact here by the Sterbenz lemma; the condition number')
print(' measures how errors present in the input are magnified, not new rounding error.)')
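To see the amplification concretely, here is a small added sketch (not in the original notebook; the perturbation sizes are arbitrary): inject a known relative error into x and watch it grow by roughly the condition number in 1 - x.
# Perturb x by a known relative amount and measure the output error in (1 - x)
x_exact = np.float64(0.9999)
for rel_pert in [1e-12, 1e-9]:
    x_noisy = x_exact * (1 + rel_pert)
    rel_err_out = abs((1 - x_noisy) - (1 - x_exact)) / abs(1 - x_exact)
    print(f'  input rel. error {rel_pert:.0e} -> output rel. error {rel_err_out:.2e} '
          f'(amplification ~{rel_err_out / rel_pert:.0f}, cond = {cond:.0f})')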
Code cell 17
# === 4.2 Matrix Condition Number and Linear System Sensitivity ===
import numpy.linalg as nla
def build_hilbert(n):
"""Hilbert matrix H_{ij} = 1/(i+j-1), notoriously ill-conditioned."""
i, j = np.meshgrid(np.arange(1, n+1), np.arange(1, n+1), indexing='ij')
return 1.0 / (i + j - 1)
print('Condition numbers of Hilbert matrices:')
print(f'{"n":>4} {"kappa(H_n)":>16} {"log10(kappa)":>12}')
for n in range(2, 13):
H = build_hilbert(n)
kappa = nla.cond(H)
print(f'{n:4d} {kappa:16.4e} {np.log10(kappa):12.1f}')
print()
print('As n increases, the Hilbert matrix becomes exponentially ill-conditioned.')
print('For n=10, kappa ~ 1e13, so roughly 13 of the ~16 fp64 significant digits are lost when solving Hx=b.')
Code cell 18
# === 4.3 Perturbation Test: Forward Error Bound ===
# Solve Hx = b for two right-hand sides that differ by a tiny perturbation
# Forward error ~ kappa(H) * relative perturbation
n = 8
H = build_hilbert(n)
kappa = nla.cond(H)
# True solution: all ones
x_true = np.ones(n)
b = H @ x_true
# Perturbed RHS
np.random.seed(0)
delta_b = np.random.randn(n) * 1e-10 # tiny perturbation
b_perturbed = b + delta_b
# Solve both systems
x_computed = nla.solve(H, b)
x_perturbed = nla.solve(H, b_perturbed)
rel_perturb_b = nla.norm(delta_b) / nla.norm(b)
rel_error_x = nla.norm(x_perturbed - x_computed) / nla.norm(x_computed)
amplification = rel_error_x / rel_perturb_b
print(f'Hilbert matrix H_{n}: kappa = {kappa:.3e}')
print(f'Relative perturbation in b: {rel_perturb_b:.2e}')
print(f'Resulting relative error in x: {rel_error_x:.2e}')
print(f'Error amplification factor: {amplification:.2e}')
print(f'Predicted bound (kappa): {kappa:.2e}')
ok = amplification <= kappa * 10 # Within order of magnitude
print(f"{'PASS' if ok else 'FAIL'} - amplification bounded by kappa")
5. Stable Algorithms for AI
Implement and verify log-sum-exp, stable softmax, and stable cross-entropy.
Code cell 20
# === 5.1 Log-Sum-Exp Stability ===
def logsumexp_naive(x):
"""Naive implementation — overflows for large x."""
return np.log(np.sum(np.exp(x)))
def logsumexp_stable(x):
"""Stable implementation using shift by max."""
c = np.max(x)
return c + np.log(np.sum(np.exp(x - c)))
# Test with large values (overflow territory for fp32)
x_large = np.array([100.0, 200.0, 300.0, 400.0], dtype=np.float32)
x_small = np.array([-1000.0, -2000.0, -500.0], dtype=np.float32)
print('Large values (fp32):')
print(f' naive = {logsumexp_naive(x_large)} (Inf — overflow!)')
print(f' stable = {logsumexp_stable(x_large):.6f}')
print(f' fp64 ref = {float(np.log(np.sum(np.exp(x_large.astype(np.float64))))):.6f}')
print()
print('Small values (fp32):')
print(f' naive = {logsumexp_naive(x_small)}')
print(f' stable = {logsumexp_stable(x_small):.6f}')
# Verify stable equals fp64 computation
x_test = np.array([1.0, 2.0, 3.0], dtype=np.float64)
ref_fp64 = np.log(np.sum(np.exp(x_test)))
from_stable = logsumexp_stable(x_test)
ok = np.isclose(ref_fp64, from_stable)
print(f"\n{'PASS' if ok else 'FAIL'} - stable logsumexp matches reference")
Code cell 21
# === 5.2 Stable Softmax ===
def softmax_naive(x):
e = np.exp(x)
return e / e.sum()
def softmax_stable(x):
e = np.exp(x - np.max(x))
return e / e.sum()
# Test: large logits (typical in early training)
logits_large = np.array([1000.0, 1001.0, 999.0], dtype=np.float32)
print('Softmax with large logits (fp32):')
try:
naive_result = softmax_naive(logits_large)
print(f' Naive: {naive_result} (NaN from overflow!)')
except:
print(' Naive: RuntimeError from overflow')
stable_result = softmax_stable(logits_large)
print(f' Stable: {stable_result}')
print(f' Sum: {stable_result.sum():.6f} (should be 1.0)')
# Verify: softmax([1000, 1001, 999]) = softmax([0, 1, -1])
reference = softmax_stable(np.array([0.0, 1.0, -1.0]))
ok = np.allclose(stable_result, reference, atol=1e-5)
print(f"{'PASS' if ok else 'FAIL'} - stable softmax equals shifted version")
print()
# Demonstrate across a range of magnitudes
print('Stable softmax properties:')
for scale in [1, 10, 100, 1000]:
z = np.array([scale, scale+1, scale-1], dtype=np.float64)
s = softmax_stable(z)
print(f' scale={scale:4d}: {s.round(4)}, sum={s.sum():.6f}')
Code cell 22
# === 5.3 Stable Cross-Entropy Loss ===
def cross_entropy_naive(logits, target_idx):
"""Naive: compute softmax then log."""
probs = softmax_naive(logits)
return -np.log(probs[target_idx])
def cross_entropy_stable(logits, target_idx):
"""Stable: log-softmax = logit - log-sum-exp."""
log_softmax = logits - logsumexp_stable(logits)
return -log_softmax[target_idx]
# Test case: model very confident about wrong class
# logits: class 0 gets 1000, target is class 1
logits = np.array([1000.0, 2.0, 1.0], dtype=np.float32)
target = 1
print('Cross-entropy with extreme logits:')
try:
loss_naive = cross_entropy_naive(logits, target)
print(f' Naive: {loss_naive:.6f}')
except:
print(' Naive: Failed (NaN/Inf)')
loss_stable = cross_entropy_stable(logits, target)
print(f' Stable: {loss_stable:.6f}')
# Check: log softmax = logit - log(sum exp)
# For target=1: -log_softmax[1] = -logit[1] + logsumexp ≈ -(2 - 1000) = 998
# Because softmax[1] = exp(2-1000) / (exp(0) + exp(2-1000) + exp(1-1000)) ≈ exp(-998)
expected_approx = 1000.0 - 2.0 # Approximately
print(f' Expected (approx): {expected_approx:.1f}')
print(f' Computed: {loss_stable:.6f}')
ok = abs(loss_stable - expected_approx) < 5.0 # Within reasonable tolerance
print(f"{'PASS' if ok else 'FAIL'} - stable cross-entropy gives sensible result")
6. Special Values: NaN, Infinity, and Signed Zero
Explore how IEEE 754 special values propagate through computations.
Code cell 24
# === 6.1 NaN Propagation ===
nan = np.float32('nan')
inf = np.float32('inf')
zero = np.float32(0.0)
print('NaN arithmetic rules (IEEE 754):')
ops = [
('nan + 1', nan + 1),
('nan * 0', nan * zero),
('nan * inf', nan * inf),
('0 / 0', zero / zero),
('inf - inf', inf - inf),
('inf * 0', inf * zero),
]
for name, val in ops:
print(f' {name} = {val}')
print()
print('Infinity arithmetic:')
ops_inf = [
('inf + 1', inf + 1),
('inf + inf', inf + inf),
('1 / 0', np.float32(1.0) / zero),
('1 / inf', np.float32(1.0) / inf),
('-1 / 0', np.float32(-1.0) / zero),
]
for name, val in ops_inf:
print(f' {name} = {val}')
print()
print('NaN detection (NaN != NaN is True):')
print(f' nan == nan: {nan == nan}')
print(f' nan != nan: {nan != nan}')
print(f' np.isnan(nan): {np.isnan(nan)}')
ok = np.isnan(nan) and not (nan == nan)
print(f"{'PASS' if ok else 'FAIL'} - NaN properties verified")
Code cell 25
# === 6.2 Signed Zero ===
pos_zero = np.float32(+0.0)
neg_zero = np.float32(-0.0)
print('Signed zero properties:')
print(f' +0.0 == -0.0: {pos_zero == neg_zero}')
print(f' 1 / (+0.0) = {np.float32(1.0) / pos_zero}')
print(f' 1 / (-0.0) = {np.float32(1.0) / neg_zero}')
print(f' str(+0.0) = {pos_zero}')
print(f' str(-0.0) = {neg_zero}')
# Bit difference
_, _, _, pbits = fp32_bits(+0.0)
_, _, _, nbits = fp32_bits(-0.0)
print(f' +0.0 bits: {pbits}')
print(f' -0.0 bits: {nbits}')
print()
print('Practical impact: relu gradient at x=0')
# relu(x) = max(0, x)
# At x=0, the gradient is conventionally 0 (subgradient)
# Some implementations return +0, others -0
x = np.float32(0.0)
relu_at_zero = np.maximum(np.float32(0.0), x)
print(f' relu(0) = {relu_at_zero}')
print(f' relu(0) is pos zero: {not np.signbit(relu_at_zero)}')
7. Numerical Formats for AI: Mixed Precision Simulation
Simulate mixed-precision training dynamics and quantization effects.
Code cell 27
# === 7.1 Precision Comparison: Format Ranges ===
formats = {
'fp64': np.float64,
'fp32': np.float32,
'fp16': np.float16,
}
print(f'{"Format":>8} {"eps_mach":>14} {"max":>12} {"min_normal":>12}')
for name, dtype in formats.items():
info = np.finfo(dtype)
print(f'{name:>8} {info.eps:14.3e} {info.max:12.3e} {info.tiny:12.3e}')
# bf16: same exponent as fp32, fewer mantissa bits
# bf16 eps = 2^-7 ≈ 7.8e-3, max ≈ 3.4e38 (same as fp32)
bf16_eps = 2**-7
bf16_max = (2 - 2**-7) * 2**127
print(f'{"bf16":>8} {bf16_eps:14.3e} {bf16_max:12.3e} {"~1.2e-38":>12}')
print()
print('Key insight: bf16 has same RANGE as fp32 but much lower PRECISION.')
print('fp16 has better precision than bf16 but much smaller range.')
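A two-line illustration of that trade-off (added sketch; the sample values 70000 and 1e-8 are arbitrary): both sit comfortably inside the fp32/bf16 range, but fp16 overflows the first and flushes the second to zero.
# fp16 range failure modes: overflow above 65504 and flush-to-zero below ~6e-8
print(f'  np.float16(70000.0) = {np.float16(70000.0)}   (fp16 max is 65504)')
print(f'  np.float16(1e-8)    = {np.float16(1e-8)}   (below the smallest fp16 subnormal ~6e-8)')
print(f'  np.float32(70000.0) = {np.float32(70000.0)},  np.float32(1e-8) = {np.float32(1e-8):.1e}')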
Code cell 28
# === 7.2 Simulating bf16 Quantization ===
def to_bf16(x):
"""Simulate bf16 by truncating fp32 mantissa to 7 bits."""
x32 = np.float32(x)
# Reinterpret as uint32, zero out lower 16 bits
bits = np.frombuffer(x32.tobytes(), dtype=np.uint32)[0]
bf16_bits = bits & 0xFFFF0000 # Keep upper 16 bits (sign, exp, 7 mantissa)
return np.frombuffer(np.uint32(bf16_bits).tobytes(), dtype=np.float32)[0]
# Test quantization error
test_vals = np.array([0.1, 0.5, 1.0, 3.14159, 100.0, 0.001], dtype=np.float32)
print(f'{"Value":>12} {"bf16 approx":>14} {"rel error":>12}')
for v in test_vals:
v_bf16 = to_bf16(v)
rel_err = abs(v_bf16 - v) / abs(v)
print(f'{v:12.6f} {v_bf16:14.6f} {rel_err:12.4e}')
print()
# bf16 eps_mach ≈ 7.8e-3 so relative errors should be at most this
bf16_eps = 2**-7
print(f'bf16 machine epsilon: {bf16_eps:.4e}')
ok = all(abs(to_bf16(v) - v) / abs(v) <= 2 * bf16_eps for v in test_vals)
print(f"{'PASS' if ok else 'FAIL'} - bf16 quantization errors within 2*eps_mach")
Code cell 29
# === 7.3 Loss Scaling Simulation ===
# Simulate mixed-precision training with loss scaling
# Scenario: gradients of magnitude 1e-8 to 1e-6 — many fall below the smallest
# fp16 subnormal (~6e-8) and flush to zero unless the loss is scaled up
np.random.seed(42)
n_params = 1000
n_steps = 50
# Simulate training without and with loss scaling
def simulate_step(grads_fp32, scale=1.0, fp16_max=65504.0):
"""Simulate one step of fp16 gradient."""
scaled_grads = grads_fp32 * scale
# Check overflow
overflow = np.any(np.abs(scaled_grads) > fp16_max)
if overflow:
return None, True # Skip step
# Quantize to fp16 precision
grads_fp16 = scaled_grads.astype(np.float16).astype(np.float32)
grads_unscaled = grads_fp16 / scale
return grads_unscaled, False
# True gradients: tiny magnitudes — without scaling, a sizeable fraction flushes to zero in fp16
true_grads = np.random.uniform(1e-8, 1e-6, n_params)
# No scaling
g_no_scale, ovf = simulate_step(true_grads, scale=1.0)
n_underflow_no = np.sum(g_no_scale == 0) if g_no_scale is not None else 0
# With loss scaling (S = 1024)
g_scaled, ovf = simulate_step(true_grads, scale=1024.0)
n_underflow_s = np.sum(g_scaled == 0) if g_scaled is not None else 0
print('Gradient survival with fp16:')
print(f' True gradient range: [{true_grads.min():.2e}, {true_grads.max():.2e}]')
print(f' fp16 min normal: {np.finfo(np.float16).tiny:.2e}, min subnormal: {float(np.nextafter(np.float16(0), np.float16(1))):.2e}')
print()
print(f' No scaling: {n_underflow_no}/{n_params} gradients underflow to zero')
if g_no_scale is not None:
print(f' Effective gradient: [{g_no_scale.min():.2e}, {g_no_scale.max():.2e}]')
print()
print(f' Scale=1024: {n_underflow_s}/{n_params} gradients underflow to zero')
if g_scaled is not None:
print(f' Effective gradient: [{g_scaled.min():.2e}, {g_scaled.max():.2e}]')
ok = n_underflow_no > n_underflow_s
print(f"{'PASS' if ok else 'FAIL'} - loss scaling reduces gradient underflow")
8. Stable Elementary Functions: expm1 and log1p
For arguments near zero, exp(x) - 1 and log(1 + x) require special implementations: expm1 and log1p.
Code cell 31
# === 8.1 expm1 vs exp-1 ===
# For small x: exp(x) - 1 suffers catastrophic cancellation
# expm1(x) uses the identity: expm1(x) = x + x^2/2 + x^3/6 + ...
x_small = np.logspace(-15, -1, 100)
naive_result = np.exp(x_small) - 1.0
stable_result = np.expm1(x_small)
# Reference: Taylor series (exact for tiny x)
# expm1(x) ≈ x + x^2/2 + x^3/6 for |x| << 1
taylor_ref = x_small + x_small**2/2 + x_small**3/6
# Relative errors
err_naive = np.abs(naive_result - taylor_ref) / (taylor_ref + 1e-300)
err_stable = np.abs(stable_result - taylor_ref) / (taylor_ref + 1e-300)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.loglog(x_small, err_naive, color=COLORS['error'], label='exp(x) - 1 (naive)')
ax.loglog(x_small, err_stable + 1e-17, color=COLORS['primary'], label='expm1(x) (stable)')
ax.axhline(np.finfo(np.float64).eps, color=COLORS['neutral'],
linestyle='--', label='fp64 ε_mach')
ax.set_xlabel('x')
ax.set_ylabel('Relative error')
ax.set_title('expm1 vs exp(x)-1 for small x')
ax.legend()
plt.tight_layout()
plt.show()
print('Plot displayed.')
print('Comparison at x = 1e-10:')
x = 1e-10
print(f' exp(x) - 1 = {np.exp(x) - 1:.20e}')
print(f' expm1(x) = {np.expm1(x):.20e}')
print(f' x (leading term) = {x:.20e}')
Code cell 32
# === 8.2 log1p vs log(1+x) ===
x_small = np.logspace(-15, -1, 100)
naive_log = np.log(1 + x_small)
stable_log = np.log1p(x_small)
taylor_ref = x_small - x_small**2/2 + x_small**3/3 # Taylor series
err_naive = np.abs(naive_log - taylor_ref) / (np.abs(taylor_ref) + 1e-300)
err_stable = np.abs(stable_log - taylor_ref) / (np.abs(taylor_ref) + 1e-300)
print('log1p precision at small x:')
for x in [1e-15, 1e-12, 1e-8, 1e-4, 1e-2]:
naive = np.log(1.0 + x)
stable = np.log1p(x)
true_approx = x # Leading term
print(f' x={x:.0e}: log(1+x)={naive:.15e} log1p(x)={stable:.15e}')
print()
# Application: Information-theoretic log-likelihood
# -log(1 - p) for p near 0 (sparse attention weights)
p = 1e-12
print(f'Negative log-likelihood at p={p:.0e}:')
print(f' -log(1-p) naive = {-np.log(1-p):.15e}')
print(f' -log1p(-p) stable = {-np.log1p(-p):.15e}')
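The same trick carries over to activations that wrap these primitives. A short added sketch (the helper names are illustrative, not library APIs): softplus via max(x, 0) + log1p(exp(-|x|)) and a sigmoid that never overflows.
# Overflow-free softplus and sigmoid built on log1p / exp(-|x|)
def softplus_stable(x):
    x = np.asarray(x, dtype=np.float64)
    return np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))
def sigmoid_stable(x):
    x = np.asarray(x, dtype=np.float64)
    z = np.exp(-np.abs(x))                 # never overflows
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
for x in [-1000.0, -20.0, 0.0, 20.0, 1000.0]:
    print(f'  x={x:8.1f}: sigmoid={float(sigmoid_stable(x)):.6e}  softplus={float(softplus_stable(x)):.6e}')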
9. Floating-Point Number Line
Visualize the non-uniform distribution of representable numbers.
Code cell 34
# === 9.1 Floating-Point Number Line Density ===
# Enumerate every positive finite fp16 value from its 16-bit patterns
# to show how representable numbers cluster near zero
all_bits = np.arange(0, 0x7C00, dtype=np.uint16) # Exclude inf/NaN
# Convert to fp16
fp16_vals = np.frombuffer(all_bits.tobytes(), dtype=np.float16)
fp16_vals = fp16_vals.astype(np.float64)
print(f'Total positive finite fp16 values: {len(fp16_vals)}')
print(f'Range: [{fp16_vals.min():.4e}, {fp16_vals.max():.4e}]')
if HAS_MPL:
fig, axes = plt.subplots(2, 1, figsize=(14, 8))
# Top: all values on number line
ax = axes[0]
# Sample for clarity
sample = fp16_vals[::64]
ax.scatter(sample, np.zeros_like(sample), s=3,
alpha=0.5, color=COLORS['primary'])
ax.set_xlabel('Value')
ax.set_yticks([])
ax.set_title('Positive fp16 values (sampled): denser near zero')
ax.set_xlim(-1, 70)
# Bottom: gap sizes vs value (log-log shows linear relationship)
ax = axes[1]
gaps = np.diff(fp16_vals)
midpoints = (fp16_vals[:-1] + fp16_vals[1:]) / 2
ok_mask = gaps > 0
ax.loglog(midpoints[ok_mask], gaps[ok_mask],
color=COLORS['primary'], linewidth=1.5)
ax.set_xlabel('Value (log scale)')
ax.set_ylabel('Gap to next fp16 (log scale)')
ax.set_title('fp16 spacing: doubles with each power of 2')
fig.tight_layout()
plt.show()
print('Plot displayed.')
# Verify: relative spacing is constant = eps_mach within each binade
eps16 = float(np.finfo(np.float16).eps)
ok = True
for v in [0.5, 1.0, 2.0, 4.0]:
v16 = np.float16(v)
next_v16 = np.nextafter(v16, np.float16(float('inf')))
gap = float(next_v16) - float(v16)
rel_gap = gap / float(v16)
ok = ok and np.isclose(rel_gap, eps16, rtol=0.1)
print(f' v={float(v16):.1f}: gap={gap:.4e}, relative={rel_gap:.4e} (eps={eps16:.4e})')
print(f"{'PASS' if ok else 'FAIL'} - relative spacing equals eps_mach")
10. Rounding Modes
IEEE 754 defines five rounding modes. Round-to-nearest-even is the default.
Code cell 36
# === 10.1 Rounding Mode Comparison ===
# Demonstrate rounding modes on values that fall exactly halfway between representable numbers
def round_to_nearest_even(x, eps):
"""Round x to nearest multiple of eps, ties go to even."""
quotient = x / eps
lower = int(quotient) # floor
upper = lower + 1
if quotient - lower < 0.5:
return lower * eps
elif quotient - lower > 0.5:
return upper * eps
else: # Tie: pick even
return (lower if lower % 2 == 0 else upper) * eps
# Compare at representable boundaries
# Values that lie exactly between two fp32 numbers
eps = float(np.finfo(np.float32).eps)
print('Rounding modes for values between representable fp32 numbers:')
print(f'{"Value":>14} {"RN-even":>10} {"Round-up":>10} {"Truncate":>10}')
# Test values: exactly halfway between two representable numbers near 1.0
# Between 1.0 and 1.0 + eps: midpoint is 1.0 + eps/2
v = 1.0 + eps/2 # Exact midpoint
import math
rn_even = float(np.float32(v)) # Round to nearest even (default)
trunc = math.floor(v / eps) * eps # Truncation
rup = math.ceil(v / eps) * eps # Round up
print(f'{v:14.10f} {rn_even:10.8f} {rup:10.8f} {trunc:10.8f}')
print()
print('Default IEEE 754 rounding: Round-to-nearest-even (RNE)')
print('Stochastic rounding: randomly rounds up or down, unbiased for training')
print()
# Demonstrate bias in round-half-up vs round-half-even
# Rounding the halves 0.5, 1.5, ..., 5.5: the exact values average to 3.0
half_values = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
def round_half_up(x): return math.floor(x + 0.5)
def round_half_even(x): return round(x) # Python uses banker's rounding
rhu = [round_half_up(v) for v in half_values]
rhe = [round_half_even(v) for v in half_values]
print(f'Round-half-up values: {rhu} mean={np.mean(rhu):.3f} (biased high!)')
print(f'Round-half-even values:{rhe} mean={np.mean(rhe):.3f} (unbiased)')
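Stochastic rounding, mentioned above, is easy to sketch on top of fp16 (added sketch; the helper name and the test value are illustrative): the value is rounded up or down at random with probabilities proportional to proximity, so the expected result equals the input.
# Stochastic rounding to fp16: unbiased on average, unlike round-to-nearest
def stochastic_round_fp16(x, rng):
    near = np.float16(x)                       # round-to-nearest candidate
    if float(near) == x:
        return near
    # neighbour on the other side of x
    other = np.nextafter(near, np.float16(np.inf) if float(near) < x else np.float16(-np.inf))
    p_other = (x - float(near)) / (float(other) - float(near))
    return other if rng.random() < p_other else near
rng = np.random.default_rng(0)
v = 1.0 + 2**-12                               # one quarter of the way between two fp16 neighbours
rn = float(np.float16(v))
sr_mean = np.mean([float(stochastic_round_fp16(v, rng)) for _ in range(20000)])
print(f'  value            : {v:.10f}')
print(f'  round-to-nearest : {rn:.10f}  (systematic bias {rn - v:+.2e})')
print(f'  stochastic mean  : {sr_mean:.10f}  (bias {sr_mean - v:+.2e}, shrinks with more samples)')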
11. Welford's Online Variance Algorithm
Numerically stable one-pass computation of mean and variance.
Code cell 38
# === 11.1 Welford vs Naive Variance ===
def welford_variance(data):
"""Compute mean and variance using Welford's online algorithm."""
n = 0
mean = 0.0
M2 = 0.0 # Sum of squared deviations
for x in data:
n += 1
delta = x - mean
mean += delta / n
delta2 = x - mean
M2 += delta * delta2
variance = M2 / (n - 1) if n > 1 else 0.0
return mean, variance
def naive_variance(data):
"""Naive two-pass variance: Var = E[X^2] - E[X]^2."""
n = len(data)
mean = sum(data) / n
return sum((x - mean)**2 for x in data) / (n - 1)
def naive_one_pass_variance(data):
"""One-pass naive: E[X^2] - (E[X])^2 — catastrophic cancellation!"""
n = len(data)
s1 = sum(data)
s2 = sum(x**2 for x in data)
return (s2 - s1**2 / n) / (n - 1)
# Test case: values with a large mean (catastrophic cancellation in the one-pass formula)
# X ~ N(1e8, 1) — large mean, unit variance; kept in fp64 so the comparison
# isolates the algorithm rather than the storage precision
np.random.seed(42)
data = [float(v) for v in 1e8 + np.random.randn(10000)]
true_variance = np.var(np.array(data, dtype=np.float64), ddof=1)
mean_w, var_w = welford_variance(data)
var_naive = naive_variance(data)
var_onepass = naive_one_pass_variance(data)
print(f'True variance (fp64): {true_variance:.8f}')
print(f'Welford variance: {var_w:.8f} error: {abs(var_w - true_variance):.2e}')
print(f'Naive 2-pass: {var_naive:.8f} error: {abs(var_naive - true_variance):.2e}')
print(f'Naive 1-pass: {var_onepass:.8f} error: {abs(var_onepass - true_variance):.2e}')
ok_w = abs(var_w - true_variance) < 0.01
ok_op = abs(var_onepass - true_variance) > abs(var_w - true_variance)
print(f"{'PASS' if ok_w else 'FAIL'} - Welford is accurate")
print(f"{'PASS' if ok_op else 'FAIL'} - naive 1-pass is less accurate than Welford")
12. Gradient Vanishing and Exploding in Floating Point
Simulate how floating-point precision limits gradient flow in deep networks.
Code cell 40
# === 12.1 Gradient Magnitude Through Layers ===
# Simulate backpropagation through a deep linear network
# Each layer: gradient multiplied by W^T
# For Gaussian weights with std sigma, E[||W^T g||^2] = sigma^2 * fan_in * ||g||^2
def simulate_gradient_flow(n_layers, sigma, n_neurons, dtype=np.float32, n_trials=50):
"""Simulate gradient norms through n_layers of random linear layers."""
norms = []
np.random.seed(42)
for _ in range(n_trials):
g = np.ones(n_neurons, dtype=dtype)
layer_norms = [np.linalg.norm(g)]
for _ in range(n_layers):
W = np.random.randn(n_neurons, n_neurons).astype(dtype) * sigma
g = W.T @ g
layer_norms.append(np.linalg.norm(g))
norms.append(layer_norms)
return np.mean(norms, axis=0)
n_neurons = 32
n_layers = 30
# Three scenarios: vanishing, stable (Xavier), exploding
sigma_vanish = 0.1 / np.sqrt(n_neurons)
sigma_xavier = 1.0 / np.sqrt(n_neurons)
sigma_explode = 3.0 / np.sqrt(n_neurons)
norms_v = simulate_gradient_flow(n_layers, sigma_vanish, n_neurons)
norms_x = simulate_gradient_flow(n_layers, sigma_xavier, n_neurons)
norms_e = simulate_gradient_flow(n_layers, sigma_explode, n_neurons)
print(f'Gradient norms after {n_layers} layers:')
print(f' Vanishing (sigma={sigma_vanish:.4f}): {norms_v[-1]:.4e}')
print(f' Xavier (sigma={sigma_xavier:.4f}): {norms_x[-1]:.4f}')
print(f' Exploding (sigma={sigma_explode:.4f}): {norms_e[-1]:.4e}')
# fp16 underflow check
fp16_min = float(np.finfo(np.float16).tiny)
print(f'\nfp16 min normal: {fp16_min:.2e}')
print(f'Vanishing grad underflows fp16: {norms_v[-1] < fp16_min}')
Code cell 41
# === 12.2 Visualization: Gradient Flow ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
layers = np.arange(n_layers + 1)
ax = axes[0]
ax.semilogy(layers, norms_v + 1e-45, color=COLORS['error'], label='Vanishing (sigma=0.1/√n)')
ax.semilogy(layers, norms_x, color=COLORS['primary'], label='Xavier (sigma=1/√n)')
ax.semilogy(layers, norms_e, color=COLORS['secondary'], label='Exploding (sigma=3/√n)')
ax.axhline(float(np.finfo(np.float16).tiny), color=COLORS['neutral'],
linestyle=':', label='fp16 underflow')
ax.axhline(float(np.finfo(np.float16).max), color=COLORS['highlight'],
linestyle=':', label='fp16 overflow')
ax.set_xlabel('Layer depth')
ax.set_ylabel('Gradient norm (log scale)')
ax.set_title('Gradient flow through deep network')
ax.legend(fontsize=9)
ax = axes[1]
ax.plot(layers, norms_x, color=COLORS['primary'], label='Xavier (fp32)')
# Simulate fp16
norms_x_fp16 = simulate_gradient_flow(n_layers, sigma_xavier, n_neurons, dtype=np.float16)
ax.plot(layers, norms_x_fp16.astype(np.float32), color=COLORS['secondary'],
linestyle='--', label='Xavier (fp16)')
ax.set_xlabel('Layer depth')
ax.set_ylabel('Gradient norm')
ax.set_title('fp32 vs fp16 gradient norms')
ax.legend()
fig.tight_layout()
plt.show()
print('Plot displayed.')
13. Summary and Verification
Consolidate all key concepts with a final verification suite.
Code cell 43
# === 13.1 Comprehensive Verification Suite ===
results = {}
# 1. Machine epsilon
eps32 = np.finfo(np.float32).eps
results['machine_epsilon'] = np.isclose(eps32, 2**-23)
# 2. Overflow detection
big = np.float32(3.4e38)
overflow = np.isinf(big * np.float32(2.0))
results['overflow_detection'] = overflow
# 3. Kahan summation
arr = np.full(10000, np.float32(1e-7))
naive_s = np.float32(0.0)
for x in arr: naive_s += x
kahan_s = kahan_sum_fp32(arr)
true_s = 1e-7 * 10000
results['kahan_better_than_naive'] = abs(kahan_s - true_s) < abs(float(naive_s) - true_s)
# 4. Stable logsumexp
z = np.array([1000.0, 1001.0, 999.0], dtype=np.float32)
lse_stable = logsumexp_stable(z)
lse_ref = float(np.log(np.sum(np.exp(z.astype(np.float64)))))
results['logsumexp_stable'] = np.isclose(lse_stable, lse_ref, rtol=1e-4)
# 5. Softmax translation invariance
z = np.array([1.0, 2.0, 3.0])
s1 = softmax_stable(z)
s2 = softmax_stable(z - 100)
results['softmax_translation_invariant'] = np.allclose(s1, s2)
# 6. Cancellation in quadratic formula
r2_good = np.float64(1.0) / r1_stable # Vieta's formula
r2_naive_err = abs(r2_standard - r2_true) / r2_true
r2_stable_err = abs(r2_good - r2_true) / r2_true
results['vieta_formula_stable'] = r2_stable_err < r2_naive_err
# 7. NaN propagation
nan = np.float32('nan')
results['nan_propagates'] = np.isnan(nan + 1.0)
# 8. Hilbert matrix ill-conditioned
H4 = build_hilbert(4)
results['hilbert_ill_conditioned'] = nla.cond(H4) > 1e4
# Print results
print('=== Verification Suite ===')
all_pass = True
for name, passed in results.items():
status = 'PASS' if passed else 'FAIL'
all_pass = all_pass and passed
print(f' [{status}] {name}')
print()
print(f'Overall: {"ALL PASS" if all_pass else "SOME FAILURES"}')
print(f'Passed: {sum(results.values())}/{len(results)}')
Key Takeaways
| Concept | Rule of Thumb |
|---|---|
| Machine epsilon | fp32: 2^-23 ≈ 1.2e-7; bf16: 2^-7 ≈ 7.8e-3 |
| Catastrophic cancellation | Occurs when subtracting nearly-equal quantities; use algebraic identities |
| Kahan summation | Reduces summation error from O(n·eps) to O(eps) using a running compensation term |
| Condition number | log10(kappa) ≈ number of digits lost in solving Ax = b; kappa ≈ 1e13 loses ~13 digits |
| Mixed precision | bf16 compute + fp32 master weights is standard for LLM training |
| Loss scaling | Essential for fp16; multiply the loss by S, check for overflow, divide gradients by S |
| Stable softmax | Always subtract the max before exp; use log-sum-exp for a numerical log-softmax |
| expm1/log1p | Use these for exp(x) - 1 and log(1 + x) when x is near zero |