Theory Notebook
1 min read · 18 headings
Theory Notebook
Converted from theory.ipynb for web reading.
Neural Networks: Theory Notebook
This notebook makes neural-network math executable: forward passes, backprop, activations, initialization, optimization, normalization, dropout, and diagnostics.
Code cell 2
# Notebook-wide setup: plotting theme, figure defaults, and a fixed RNG seed
# so every cell below is reproducible.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    # Prefer seaborn's theme when it is installed.
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # Fall back to matplotlib's bundled seaborn-flavoured style sheet.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults applied on top of the chosen theme.
_RC_OVERRIDES = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_RC_OVERRIDES)

np.random.seed(42)  # legacy global seed, kept for cells using np.random
print("Plot setup complete.")
1. Affine layer and ReLU
Code cell 4
# One affine layer followed by ReLU on a tiny hand-written batch.
X = np.array([[1.0, -2.0], [0.5, 1.0]])                # 2 samples x 2 features
W = np.array([[0.2, -0.4], [1.0, 0.5], [-0.3, 0.8]])   # 3 units x 2 inputs
b = np.array([0.1, -0.2, 0.3])                         # one bias per unit
Z = X @ W.T + b                 # pre-activations, shape (2, 3)
H = np.where(Z > 0, Z, 0.0)     # ReLU: keep positives, zero the rest
print("Z:\n", np.round(Z, 3))
print("ReLU(H):\n", np.round(H, 3))
2. Two-layer forward pass
Code cell 6
# Forward pass of a two-layer MLP: ReLU hidden layer, linear output head.
rng = np.random.default_rng(1)
X = rng.normal(size=(4, 3))                  # batch of 4 samples, 3 features
W1 = rng.normal(scale=0.5, size=(5, 3))      # hidden layer: 5 units
b1 = np.zeros(5)
W2 = rng.normal(scale=0.5, size=(2, 5))      # output head: 2 logits
b2 = np.zeros(2)
pre_act = X @ W1.T + b1
H = np.maximum(0, pre_act)                   # hidden activations
logits = H @ W2.T + b2
print("hidden shape:", H.shape)
print("logits shape:", logits.shape)
3. Softmax cross-entropy
Code cell 8
# Numerically stable softmax plus mean cross-entropy for a 2-sample batch.
logits = np.array([[2.0, 1.0, -1.0], [0.1, 0.2, 0.3]])
targets = np.array([0, 2])                    # true class index per sample
stable = logits - logits.max(axis=1, keepdims=True)  # shift avoids exp overflow
exp_scores = np.exp(stable)
probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
true_class_probs = probs[np.arange(targets.size), targets]
loss = -np.log(true_class_probs).mean()       # mean negative log-likelihood
print("probs:\n", np.round(probs, 3))
print("loss:", loss)
4. Backprop through affine layer
Code cell 10
# Gradients of an affine layer Z = X @ W.T + b given upstream gradient dZ.
# NOTE(review): relies on `rng` created in the earlier forward-pass cell.
X = rng.normal(size=(3, 4))
W = rng.normal(size=(2, 4))
dZ = rng.normal(size=(3, 2))     # upstream gradient w.r.t. Z
dW = dZ.T @ X                    # matches W's (out, in) layout
db = dZ.sum(axis=0)              # bias broadcasts over the batch, so sum it
dX = dZ @ W                      # propagate to the layer's input
for label, grad in (("dW", dW), ("db", db), ("dX", dX)):
    print(f"{label} shape:", grad.shape)
5. ReLU derivative
Code cell 12
# ReLU and its (sub)derivative; the convention here is derivative 0 at z == 0.
z = np.array([-2.0, 0.0, 3.0])
relu = np.where(z > 0, z, 0.0)
drelu = np.where(z > 0, 1.0, 0.0)
print("relu:", relu)
print("derivative convention:", drelu)
6. Finite-difference gradient check
Code cell 14
# Central-difference check of the analytic gradient of a squared-error loss.
x = np.array([1.0, -2.0])
w = np.array([0.3, 0.7])
y = 1.5

def loss(wv):
    """Squared error of the linear prediction wv @ x against target y."""
    residual = wv @ x - y
    return 0.5 * residual ** 2

# d/dw [ 0.5 * (w @ x - y)^2 ] = (w @ x - y) * x
analytic = (w @ x - y) * x
eps = 1e-6
# Perturb one coordinate at a time: rows of eps * I are the unit steps.
numeric = np.array([
    (loss(w + step) - loss(w - step)) / (2 * eps)
    for step in eps * np.eye(len(w))
])
print("analytic:", analytic)
print("numeric:", numeric)
print("max error:", np.max(np.abs(analytic - numeric)))
7. Initialization scale
Code cell 16
# Standard weight-initialization scales: Xavier/Glorot balances forward and
# backward variance; He compensates for ReLU halving the variance.
fan_in, fan_out = 256, 512
xavier_std = (2.0 / (fan_in + fan_out)) ** 0.5
he_std = (2.0 / fan_in) ** 0.5
print("Xavier std:", xavier_std)
print("He std:", he_std)
8. Activation variance through depth
Code cell 18
# How activation variance evolves through depth for three init scales:
# too small collapses toward 0, He stays healthy, too large blows up.
rng = np.random.default_rng(2)
B, d, depth = 512, 256, 20
scale_configs = [
    ("too small", 0.02),
    ("He", np.sqrt(2 / d)),
    ("too large", 0.2),
]
for name, std in scale_configs:
    h = rng.normal(size=(B, d))
    variances = []
    for _ in range(depth):
        W = rng.normal(scale=std, size=(d, d))
        h = np.maximum(0, h @ W.T)   # affine + ReLU, no bias
        variances.append(h.var())
    print(name, "final variance:", variances[-1])
9. SGD with momentum
Code cell 20
# Five steps of SGD with classical momentum on a quadratic bowl
# (loss 0.5 * ||theta||^2, so grad = theta).
theta = np.array([2.0, -1.0])
v = np.zeros_like(theta)       # velocity accumulator
beta, lr = 0.9, 0.1
for step in range(5):
    grad = theta               # gradient of 0.5 * ||theta||^2
    v = beta * v + grad        # decayed running sum of gradients
    theta = theta - lr * v
    print(step, np.round(theta, 4))
10. Dropout with inverted scaling
Code cell 22
# Inverted dropout: scale survivors by 1/(1-p) at train time so the expected
# activation matches what inference sees with no mask at all.
rng = np.random.default_rng(3)
h = np.ones(10)
p_drop = 0.3
keep = rng.random(size=h.shape) > p_drop   # True where the unit survives
mask = keep.astype(float)
h_drop = h * mask / (1 - p_drop)
print("mask:", mask.astype(int))
print("mean after inverted dropout:", h_drop.mean())
11. LayerNorm
Code cell 24
# LayerNorm: normalize each row (feature vector) to zero mean and unit
# variance; the epsilon keeps the constant second row from dividing by zero.
x = np.array([[1.0, 2.0, 4.0], [2.0, 2.0, 2.0]])
row_mean = x.mean(axis=1, keepdims=True)
row_var = x.var(axis=1, keepdims=True)
y = (x - row_mean) / np.sqrt(row_var + 1e-5)
print("LayerNorm:\n", np.round(y, 4))
12. Train/validation diagnostic
Code cell 26
# Synthetic learning curves: validation loss bottoms out and then drifts
# upward past step 45 — the classic overfitting signature.
steps = np.arange(80)
train = 0.05 + np.exp(-steps / 25)
overfit_drift = 0.002 * np.maximum(steps - 45, 0)   # kicks in after step 45
val = 0.08 + np.exp(-steps / 35) + overfit_drift
for curve, label in ((train, "train"), (val, "validation")):
    plt.plot(steps, curve, label=label)
plt.title("Training and validation curves")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.tight_layout()
plt.show()
13. Final checklist
Code cell 28
# Sanity checklist for training runs, printed as a numbered list.
checks = [
    "forward shapes match expected layer dimensions",
    "loss decreases on a tiny overfit batch",
    "gradients pass finite-difference checks for small modules",
    "activation means and variances are healthy",
    "gradient norms are tracked by layer",
    "train and validation curves are interpreted together",
]
for number, item in enumerate(checks, start=1):
    print(f"{number}. {item}")