Neural Networks: Theory Notebook

Converted from theory.ipynb for web reading.

This notebook makes neural-network math executable: forward passes, backprop, activations, initialization, optimization, normalization, dropout, and diagnostics.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

1. Affine layer and ReLU

Code cell 4

X = np.array([[1.0, -2.0], [0.5, 1.0]])               # batch of 2 inputs, 2 features each
W = np.array([[0.2, -0.4], [1.0, 0.5], [-0.3, 0.8]])  # 3 units, each with 2 weights
b = np.array([0.1, -0.2, 0.3])
Z = X @ W.T + b          # affine pre-activation, shape (2, 3)
H = np.maximum(0, Z)     # ReLU
print("Z:\n", np.round(Z, 3))
print("H = ReLU(Z):\n", np.round(H, 3))

2. Two-layer forward pass

Code cell 6

rng = np.random.default_rng(1)
X = rng.normal(size=(4, 3))
W1 = rng.normal(scale=0.5, size=(5, 3))
b1 = np.zeros(5)
W2 = rng.normal(scale=0.5, size=(2, 5))
b2 = np.zeros(2)
H = np.maximum(0, X @ W1.T + b1)
logits = H @ W2.T + b2
print("hidden shape:", H.shape)
print("logits shape:", logits.shape)

3. Softmax cross-entropy

Code cell 8

logits = np.array([[2.0, 1.0, -1.0], [0.1, 0.2, 0.3]])
targets = np.array([0, 2])
shifted = logits - logits.max(axis=1, keepdims=True)   # subtract row max for numerical stability
probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
loss = -np.log(probs[np.arange(len(targets)), targets]).mean()   # mean negative log-likelihood
print("probs:\n", np.round(probs, 3))
print("loss:", loss)

4. Backprop through affine layer

Code cell 10

X = rng.normal(size=(3, 4))
W = rng.normal(size=(2, 4))
dZ = rng.normal(size=(3, 2))
dW = dZ.T @ X        # gradient w.r.t. W for Z = X @ W.T + b
db = dZ.sum(axis=0)  # gradient w.r.t. b
dX = dZ @ W          # gradient w.r.t. X
print("dW shape:", dW.shape)
print("db shape:", db.shape)
print("dX shape:", dX.shape)

5. ReLU derivative

Code cell 12

z = np.array([-2.0, 0.0, 3.0])
relu = np.maximum(0, z)
drelu = (z > 0).astype(float)   # convention: ReLU'(0) = 0
print("relu:", relu)
print("derivative convention:", drelu)

6. Finite-difference gradient check

Code cell 14

x = np.array([1.0, -2.0])
w = np.array([0.3, 0.7])
y = 1.5
def loss(wv):
    pred = wv @ x
    return 0.5 * (pred - y) ** 2
analytic = (w @ x - y) * x
eps = 1e-6
numeric = np.zeros_like(w)
for i in range(len(w)):
    e = np.zeros_like(w)
    e[i] = eps
    numeric[i] = (loss(w + e) - loss(w - e)) / (2 * eps)
print("analytic:", analytic)
print("numeric:", numeric)
print("max error:", np.max(np.abs(analytic - numeric)))

7. Initialization scale

Code cell 16

fan_in, fan_out = 256, 512
xavier_std = np.sqrt(2 / (fan_in + fan_out))
he_std = np.sqrt(2 / fan_in)
print("Xavier std:", xavier_std)
print("He std:", he_std)

8. Activation variance through depth

Code cell 18

rng = np.random.default_rng(2)
B, d, depth = 512, 256, 20
for name, std in [("too small", 0.02), ("He", np.sqrt(2/d)), ("too large", 0.2)]:
    h = rng.normal(size=(B, d))
    variances = []
    for _ in range(depth):
        W = rng.normal(scale=std, size=(d, d))
        h = np.maximum(0, h @ W.T)
        variances.append(h.var())
    print(name, "final variance:", variances[-1])
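
The final variance hides how quickly the signal collapses or explodes; plotting the whole trajectory makes it visible. This added plotting sketch repeats the loop from the cell above so it can keep each curve.

rng = np.random.default_rng(2)
for name, std in [("too small", 0.02), ("He", np.sqrt(2 / d)), ("too large", 0.2)]:
    h = rng.normal(size=(B, d))
    variances = []
    for _ in range(depth):
        W = rng.normal(scale=std, size=(d, d))
        h = np.maximum(0, h @ W.T)
        variances.append(h.var())
    plt.plot(range(1, depth + 1), variances, label=name)
plt.yscale("log")
plt.xlabel("layer")
plt.ylabel("activation variance")
plt.title("Activation variance vs. depth")
plt.legend()
plt.tight_layout()
plt.show()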

9. SGD with momentum

Code cell 20

theta = np.array([2.0, -1.0])
v = np.zeros_like(theta)
beta = 0.9
lr = 0.1
for step in range(5):
    grad = theta   # gradient of the quadratic loss 0.5 * ||theta||^2
    v = beta * v + grad
    theta = theta - lr * v
    print(step, np.round(theta, 4))
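
For contrast, Adam keeps running estimates of both the first and second moments of the gradient and rescales each coordinate. Below is a minimal added sketch on the same quadratic-style gradient (grad = theta), using the usual default hyperparameters.

theta = np.array([2.0, -1.0])
m = np.zeros_like(theta)      # first-moment estimate
v = np.zeros_like(theta)      # second-moment estimate
beta1, beta2, lr, eps = 0.9, 0.999, 0.1, 1e-8
for step in range(1, 6):
    grad = theta
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad ** 2
    m_hat = m / (1 - beta1 ** step)   # bias correction
    v_hat = v / (1 - beta2 ** step)
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + eps)
    print(step, np.round(theta, 4))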

10. Dropout with inverted scaling

Code cell 22

rng = np.random.default_rng(3)
h = np.ones(10)
p_drop = 0.3
mask = (rng.random(size=h.shape) > p_drop).astype(float)   # keep each unit with probability 1 - p_drop
h_drop = h * mask / (1 - p_drop)                            # inverted scaling preserves the expected value
print("mask:", mask.astype(int))
print("mean after inverted dropout:", h_drop.mean())

11. LayerNorm

Code cell 24

x = np.array([[1.0, 2.0, 4.0], [2.0, 2.0, 2.0]])
mean = x.mean(axis=1, keepdims=True)
var = x.var(axis=1, keepdims=True)
y = (x - mean) / np.sqrt(var + 1e-5)   # small epsilon avoids division by zero
print("LayerNorm:\n", np.round(y, 4))

12. Train/validation diagnostic

Code cell 26

steps = np.arange(80)
train = np.exp(-steps / 25) + 0.05
val = np.exp(-steps / 35) + 0.08 + 0.002 * np.maximum(steps - 45, 0)   # extra term after step 45 widens the gap to the training curve
plt.plot(steps, train, label="train")
plt.plot(steps, val, label="validation")
plt.title("Training and validation curves")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.tight_layout()
plt.show()
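
One useful number to pull from these curves is the generalization gap (validation minus training loss) and how much it grows, which in this synthetic example is driven by the extra term after step 45. An added diagnostic over the arrays above:

gap = val - train   # generalization gap
print("gap at step 0:", round(float(gap[0]), 4))
print("gap at final step:", round(float(gap[-1]), 4))
print("gap growth over training:", round(float(gap[-1] - gap[0]), 4))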

13. Final checklist

Code cell 28

checks = [
    "forward shapes match expected layer dimensions",
    "loss decreases on a tiny overfit batch",
    "gradients pass finite-difference checks for small modules",
    "activation means and variances are healthy",
    "gradient norms are tracked by layer",
    "train and validation curves are interpreted together",
]
for i, check in enumerate(checks, 1):
    print(f"{i}. {check}")
