Exercises Notebook
Converted from exercises.ipynb for web reading.
Derivatives and Differentiation - Exercises
10 graded exercises covering the full section arc, from core calculus mechanics to ML-facing applications.
| Cell | Description |
|---|---|
| Problem | Markdown cell with task description |
| Your Solution | Code cell for learner work |
| Solution | Reference solution with checks |
Difficulty: straightforward -> moderate -> challenging.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
ok = bool(cond)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
return ok
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
return ok
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
def forward_diff(f, x, h=1e-6):
return (f(x + h) - f(x)) / h
def backward_diff(f, x, h=1e-6):
return (f(x) - f(x - h)) / h
def grad_check(f, x, analytic_grad, h=1e-6):
x = np.asarray(x, dtype=float)
analytic_grad = np.asarray(analytic_grad, dtype=float)
numeric_grad = np.zeros_like(x, dtype=float)
for idx in np.ndindex(x.shape):
x_plus = x.copy(); x_minus = x.copy()
x_plus[idx] += h; x_minus[idx] -= h
numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
return la.norm(analytic_grad - numeric_grad) / denom
def check(name, got, expected, tol=1e-8):
return check_close(name, got, expected, tol=tol)
print("Chapter helper setup complete.")
Exercise 1 (★): Basic Differentiation Rules
Implement analytical derivatives for three functions using the power, product, and quotient rules.
(a) $f(x) = 3x^4 - 2x^2 + 7x - 1$
(b) $g(x) = \sqrt{x}\,e^x$ (product rule)
(c) $h(x) = \dfrac{x^2 + 1}{x - 1}$ (quotient rule, $x \neq 1$)
For each, implement the derivative analytically, then verify against centered_diff.
Code cell 5
# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")
Code cell 6
# Solution
# Exercise 1 - reference solution
def f(x):
return 3*x**4 - 2*x**2 + 7*x - 1
def f_prime(x):
return 12*x**3 - 4*x + 7
def g(x):
return np.sqrt(x) * np.exp(x)
def g_prime(x):
# Product rule: u=sqrt(x), v=e^x => u'v + uv' = 1/(2sqrt(x))*e^x + sqrt(x)*e^x
return (1/(2*np.sqrt(x))) * np.exp(x) + np.sqrt(x) * np.exp(x)
def h(x):
return (x**2 + 1) / (x - 1)
def h_prime(x):
# Quotient rule: (f'g - fg')/g^2 = (2x*(x-1) - (x^2+1)*1)/(x-1)^2
return (2*x*(x-1) - (x**2+1)) / (x-1)**2
x_test = 2.0
header('Exercise 1: Basic Differentiation Rules')
check_close('f\'(2.0) = 12(8)-8+7 = 95', f_prime(x_test), 95.0)
check_close('f\' matches numerical', f_prime(x_test), centered_diff(f, x_test))
check_close('g\' matches numerical', g_prime(x_test), centered_diff(g, x_test))
check_close('h\' matches numerical', h_prime(x_test), centered_diff(h, x_test))
# Verify at multiple points
x_pts = np.array([0.5, 1.0, 2.0, 3.5])
for fn, fn_prime, fn_name in [(f, f_prime, 'f'), (g, g_prime, 'g')]:
errs = [abs(fn_prime(x) - centered_diff(fn, x)) for x in x_pts]
check_true(f'{fn_name}\'(x) accurate at 4 test points', all(e < 1e-5 for e in errs))
print('\nTakeaway: Power rule gives polynomial derivatives; product/quotient rules handle compositions.')
print("Exercise 1 solution complete.")
Exercise 2 (★): Chain Rule
Differentiate each of the following using the chain rule and verify numerically.
(a) $p(x) = \sin(x^3)$
(b) $q(x) = \ln(\cos x)$ for $\cos x > 0$
(c) $r(x) = e^{-x^2/2}$ (Gaussian kernel)
(d) $s(x) = (1 + x^2)^{10}$
Identify the outer and inner functions for each.
Code cell 8
# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")
Code cell 9
# Solution
# Exercise 2 - reference solution
def p(x): return np.sin(x**3)
def p_prime(x): return np.cos(x**3) * 3*x**2 # outer=sin, inner=x^3
def q(x): return np.log(np.cos(x))
def q_prime(x): return -np.sin(x) / np.cos(x) # = -tan(x)
def r(x): return np.exp(-x**2/2)
def r_prime(x): return np.exp(-x**2/2) * (-x) # outer=exp, inner=-x^2/2
def s(x): return (1 + x**2)**10
def s_prime(x): return 10*(1+x**2)**9 * 2*x # outer=u^10, inner=1+x^2
x_test = 1.0
header('Exercise 2: Chain Rule')
check_close('p\' matches numerical at x=1', p_prime(x_test), centered_diff(p, x_test))
check_close('q\' matches numerical at x=0.5', q_prime(0.5), centered_diff(q, 0.5))
check_close('r\' matches numerical at x=1', r_prime(x_test), centered_diff(r, x_test))
check_close('s\' matches numerical at x=1', s_prime(x_test), centered_diff(s, x_test))
# q'(x) = -tan(x) — verify this identity
check_close('q\'(0.5) = -tan(0.5)', q_prime(0.5), -np.tan(0.5))
print('\nTakeaway: Chain rule = differentiate outside × keep inside × derivative of inside.')
print("Exercise 2 solution complete.")
Exercise 3 (★): Implicit Differentiation
The ellipse $\dfrac{x^2}{4} + \dfrac{y^2}{9} = 1$ defines $y$ implicitly as a function of $x$.
(a) Differentiate both sides implicitly with respect to $x$ to find $\dfrac{dy}{dx}$.
(b) At the point $\left(1, \tfrac{3\sqrt{3}}{2}\right)$ on the upper ellipse, compute the slope of the tangent line.
(c) Verify numerically: define $y(x) = 3\sqrt{1 - x^2/4}$ explicitly and compare your formula to centered_diff(y, 1.0).
(d) Find all points where the tangent is horizontal (slope = 0).
Code cell 11
# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")
Code cell 12
# Solution
# Exercise 3 - reference solution
import numpy as np
# (a) Implicit: x^2/4 + y^2/9 = 1
# Differentiate: x/2 + 2y/9 * y' = 0 => y' = -9x/(4y)
def dydx_ellipse(x, y):
return -9*x / (4*y)
# (b)
x0 = 1.0
y0 = 3 * np.sqrt(1 - x0**2/4)
slope_at_point = dydx_ellipse(x0, y0)
# (c) Explicit derivative for verification
def y_upper(x): return 3 * np.sqrt(1 - x**2/4)
numerical_slope = (y_upper(x0+1e-6) - y_upper(x0-1e-6)) / 2e-6
header('Exercise 3: Implicit Differentiation — Ellipse')
print(f'Point: ({x0}, {y0:.6f})')
check_close('Implicit slope at (1, y0)', slope_at_point, numerical_slope)
# Verify formula at multiple points
for x in [0.5, 1.0, 1.5]:
y = 3*np.sqrt(1-x**2/4)
a = dydx_ellipse(x, y)
n = (y_upper(x+1e-6) - y_upper(x-1e-6)) / 2e-6
check_close(f'slope at x={x}', a, n, tol=1e-5)
# (d) Horizontal tangents: dy/dx = 0 => x=0
x_horiz = 0.0
y_top = 3.0 # y(0) = 3
y_bot = -3.0
check_true('Horizontal tangent at (0, ±3)', abs(dydx_ellipse(x_horiz, y_top)) < 1e-10)
print('\nTakeaway: Implicit differentiation avoids solving for y — differentiate the equation directly.')
print("Exercise 3 solution complete.")
Exercise 4 (★★): Activation Function Derivatives
(a) Derive $\sigma'(x) = \sigma(x)\bigl(1 - \sigma(x)\bigr)$ from scratch using the quotient rule.
(b) Implement sigmoid and its derivative; verify at $x \in \{-3, 0, 3\}$.
(c) Derive $\tanh'(x) = 1 - \tanh^2(x)$. Verify numerically.
(d) For a network with $L$ sigmoid layers, each with pre-activation near $z = 2$, compute the product $\bigl[\sigma'(z)\bigr]^L$ for $L \in \{1, 5, 10, 20\}$. Explain the vanishing gradient problem.
Code cell 14
# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")
Code cell 15
# Solution
# Exercise 4 - reference solution
import numpy as np
def sigmoid(x):
return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))
def sigmoid_prime(x):
s = sigmoid(x)
return s * (1 - s)
def tanh_prime(x):
return 1 - np.tanh(x)**2
header('Exercise 4: Activation Function Derivatives')
# (b) Verify sigmoid'
for x in [-3.0, 0.0, 3.0]:
a = sigmoid_prime(x)
n = (sigmoid(x+1e-6) - sigmoid(x-1e-6)) / 2e-6
check_close(f'sigma\'({x:+.0f})', a, n, tol=1e-5)
# Peak at x=0: 1/4
check_close('sigma\'(0) = 0.25', sigmoid_prime(0.0), 0.25)
# (c) Verify tanh'
for x in [-1.0, 0.0, 1.5]:
a = tanh_prime(x)
n = (np.tanh(x+1e-6) - np.tanh(x-1e-6)) / 2e-6
check_close(f'tanh\'({x:+.1f})', a, n, tol=1e-5)
# (d) Vanishing gradient
print('\nVanishing gradient at z=2.0:')
z = 2.0
sp = sigmoid_prime(z)
print(f'sigma\'(2.0) = {sp:.4f}')
for L in [1, 5, 10, 20]:
product = sp**L
check_true(f'L={L:2d}: product={product:.2e} (vanishing for L>=5)', L < 4 or product < 1e-2)
print('\nTakeaway: Sigmoid gradients max at 0.25 — products vanish exponentially with depth.')
print("Exercise 4 solution complete.")
Exercise 5 (★★): Critical Points and Extrema
Consider $f(x) = (x - 1)^4$.
(a) Find all critical points analytically (where $f'(x) = 0$).
(b) Apply the second derivative test. What does it tell you? Is the test conclusive?
(c) Apply the first derivative test (check the sign of $f'$ on both sides of the critical point).
(d) Find the global minimum and maximum of $f$ on $[-1, 3]$.
(e) Plot $f$, $f'$, and $f''$ on an interval around the critical point $x = 1$.
Code cell 17
# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")
Code cell 18
# Solution
# Exercise 5 - reference solution
import numpy as np
def f(x): return (x-1)**4
def f_prime(x): return 4*(x-1)**3
def f_double_prime(x): return 12*(x-1)**2
header('Exercise 5: Critical Points and Extrema')
# (a) Critical points: f'(x) = 4(x-1)^3 = 0 => x=1 (triple root)
x_crit = 1.0
check_close('f\'(1) = 0', f_prime(x_crit), 0.0)
# (b) Second derivative test
fpp_crit = f_double_prime(x_crit)
print(f'f\'\'(1) = {fpp_crit} — INCONCLUSIVE (= 0, but f = (x-1)^4 >= 0)')
# (c) First derivative test
# f'(0.5) = 4(-0.5)^3 = -0.5 < 0 (decreasing before)
# f'(1.5) = 4(0.5)^3 = 0.5 > 0 (increasing after)
# sign changes - to + => LOCAL MINIMUM
check_true('f\' sign change: - to + at x=1 (local min)', f_prime(0.5) < 0 and f_prime(1.5) > 0)
# (d) Global extrema on [-1, 3]
candidates = {'x=-1': (-1.0, f(-1.0)), 'x=1 (crit)': (1.0, f(1.0)), 'x=3': (3.0, f(3.0))}
for name, (x, fx) in candidates.items():
print(f' {name}: f = {fx:.4f}')
check_close('Global min = f(1) = 0', f(1.0), 0.0)
check_close('Global max = f(-1) = 16 or f(3) = 16', f(-1.0), f(3.0)) # both = 16
# (e) Plot
try:
import matplotlib.pyplot as plt
x = np.linspace(-0.5, 2.5, 300)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, fn, nm in zip(axes, [f, f_prime, f_double_prime], ['f(x)', "f'(x)", "f''(x)"]):
ax.plot(x, fn(x), lw=2); ax.axhline(0, color='k', lw=0.7)
ax.axvline(1, color='r', ls='--', alpha=0.5, label='x=1')
ax.set_title(nm); ax.legend()
plt.tight_layout(); plt.show()
except ImportError:
pass
print('\nTakeaway: f\'\'=0 is INCONCLUSIVE — always verify with sign of f\' or higher derivatives.')
print("Exercise 5 solution complete.")
Exercise 6 (★★): Mean Value Theorem Application
(a) Prove that $|\sin a - \sin b| \le |a - b|$ for all $a, b$ using the MVT.
(b) Use the MVT to show that for .
(c) For $f(x) = x^3 - x$ on $[-2, 2]$, find all $c$ guaranteed by the MVT and verify numerically.
(d) Toward the descent lemma of gradient descent: if $|f''(x)| \le L$ for all $x$, show that $|f'(x_1) - f'(x_2)| \le L\,|x_1 - x_2|$ (the gradient is $L$-Lipschitz). Verify for $g(x) = x^2$.
Code cell 20
# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")
Code cell 21
# Solution
# Exercise 6 - reference solution
import numpy as np
def f(x): return x**3 - x
def f_prime(x): return 3*x**2 - 1
header('Exercise 6: Mean Value Theorem')
# (a) |sin a - sin b| <= |a-b| — verify numerically
np.random.seed(42)
for _ in range(20):
a, b = np.random.uniform(-10, 10, 2)
lhs = abs(np.sin(a) - np.sin(b))
rhs = abs(a - b)
if lhs > rhs + 1e-10:
print(f'FAIL at a={a:.2f}, b={b:.2f}')
break
else:
print('PASS — |sin a - sin b| <= |a-b| verified on 20 random pairs')
# (c) Find MVT c values for f(x)=x^3-x on [-2,2]
a, b = -2.0, 2.0
secant_slope = (f(b) - f(a)) / (b - a)
print(f'\nSecant slope: {secant_slope:.4f}')
# 3c^2 - 1 = secant_slope => c^2 = (secant_slope + 1)/3
c_sq = (secant_slope + 1) / 3
c_pos = np.sqrt(c_sq); c_neg = -np.sqrt(c_sq)
print(f'MVT c values: ±{c_pos:.4f}')
check_close('f\'(c_pos) = secant slope', f_prime(c_pos), secant_slope)
check_close('f\'(c_neg) = secant slope', f_prime(c_neg), secant_slope)
check_true('c_pos in (-2,2)', -2 < c_pos < 2)
# (d) L-Lipschitz gradient for g(x)=x^2
def g(x): return x**2
def g_prime(x): return 2*x
# g''(x) = 2 everywhere, so L = 2
L = 2.0
x1, x2 = 1.0, 3.0
lhs = abs(g_prime(x1) - g_prime(x2))
rhs = L * abs(x1 - x2)
print(f'\nLipschitz bound: {lhs:.4f} <= {rhs:.4f}')
check_true('|g\'(x1)-g\'(x2)| <= L|x1-x2|', lhs <= rhs + 1e-10)
print('\nTakeaway: MVT provides Lipschitz bounds that determine safe learning rates (eta <= 1/L).')
print("Exercise 6 solution complete.")
Exercise 7 (★★★): Numerical Differentiation and Gradient Checking
(a) Implement forward, backward, and centered finite differences for approximating $f'(x)$ with step size $h$.
(b) For $f(x) = x \ln\bigl(x + e^{-x}\bigr)$, compute the analytic derivative and verify against each finite difference method at $x = 1.5$.
(c) Experiment with $h \in \{10^{-1}, 10^{-3}, 10^{-6}, 10^{-10}, 10^{-15}\}$. Record the error for each. Identify the optimal $h$ for centered differences in float64.
(d) Implement grad_check_vector for a vector-to-scalar function. Test it on the L2 loss $f(\mathbf{w}) = \sum_i w_i^2$ at $\mathbf{w} = (1, 2, 3)$, where $\nabla f(\mathbf{w}) = 2\mathbf{w}$.
Code cell 23
# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")
Code cell 24
# Solution
# Exercise 7 - reference solution
import numpy as np
def forward_diff(f, x, h): return (f(x+h) - f(x)) / h
def backward_diff(f, x, h): return (f(x) - f(x-h)) / h
def centered_diff_h(f, x, h): return (f(x+h) - f(x-h)) / (2*h)
def f(x): return x * np.log(x + np.exp(-x))
def f_prime_analytic(x):
# f = x * ln(x + e^{-x})
# product rule: 1*ln(x+e^-x) + x * (1 - e^-x)/(x + e^-x)
return np.log(x + np.exp(-x)) + x * (1 - np.exp(-x)) / (x + np.exp(-x))
header('Exercise 7: Gradient Checking')
x = 1.5
true_deriv = f_prime_analytic(x)
print(f'Analytic f\'(1.5) = {true_deriv:.8f}')
h_vals = [1e-1, 1e-3, 1e-6, 1e-10, 1e-15]
print(f'\n{"h":>8} {"Forward err":>14} {"Centered err":>14}')
best_ctr_h = None; best_ctr_err = np.inf
for h in h_vals:
fwd_err = abs(forward_diff(f, x, h) - true_deriv)
ctr_err = abs(centered_diff_h(f, x, h) - true_deriv)
if ctr_err < best_ctr_err:
best_ctr_err = ctr_err; best_ctr_h = h
print(f'{h:8.0e} {fwd_err:14.2e} {ctr_err:14.2e}')
print(f'\nOptimal h for centered: {best_ctr_h:.0e} (theoretical: ~1e-6)')
# (d) Vector grad check
def L2_loss(w): return np.sum(w**2)
def L2_grad(w): return 2*w
w = np.array([1.0, 2.0, 3.0])
err = grad_check(L2_loss, w, L2_grad(w))
print(f'\nVector gradient check relative error: {err:.2e}')
check_true('Gradient check passes (err < 1e-5)', err < 1e-5)
print('\nTakeaway: Centered differences have O(h^2) error — optimal h ≈ 1e-6 for float64.')
print("Exercise 7 solution complete.")
Exercise 8 (★★★): Backpropagation via Chain Rule
A 2-layer network with sigmoid activations and MSE loss:
$$z_1 = w_1 x + b_1,\qquad a_1 = \sigma(z_1),\qquad z_2 = w_2 a_1 + b_2,\qquad a_2 = \sigma(z_2),\qquad \mathcal{L} = \tfrac{1}{2}(a_2 - y)^2.$$
(a) Derive $\partial \mathcal{L}/\partial w_2$ and $\partial \mathcal{L}/\partial b_2$ using the chain rule.
(b) Derive $\partial \mathcal{L}/\partial w_1$ and $\partial \mathcal{L}/\partial b_1$ (requires backpropagating through 2 sigmoid layers).
(c) Implement the forward and backward passes. Verify all 4 gradients against numerical estimates.
(d) Run 200 gradient descent steps with $\eta = 0.5$ on the single example $x = 1.5$, $y = 0.7$. Plot the loss curve.
Code cell 26
# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")
Code cell 27
# Solution
# Exercise 8 - reference solution
import numpy as np
def sigmoid(x): return 1/(1+np.exp(-np.clip(x,-500,500)))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)
def forward(x, y, w1, b1, w2, b2):
z1 = w1*x + b1
a1 = sigmoid(z1)
z2 = w2*a1 + b2
a2 = sigmoid(z2)
loss = 0.5*(a2 - y)**2
return loss, {'z1':z1,'a1':a1,'z2':z2,'a2':a2,'x':x,'y':y}
def backward(cache, y, w1, b1, w2, b2):
z1,a1,z2,a2,x = cache['z1'],cache['a1'],cache['z2'],cache['a2'],cache['x']
dL_da2 = a2 - y
da2_dz2 = sigmoid_prime(z2)
dL_dz2 = dL_da2 * da2_dz2
dL_dw2 = dL_dz2 * a1
dL_db2 = dL_dz2
dL_da1 = dL_dz2 * w2
dL_dz1 = dL_da1 * sigmoid_prime(z1)
dL_dw1 = dL_dz1 * x
dL_db1 = dL_dz1
return {'w1':dL_dw1,'b1':dL_db1,'w2':dL_dw2,'b2':dL_db2}
x, y = 1.5, 0.7
w1, b1, w2, b2 = 0.5, -0.3, 1.2, 0.1
params = [w1, b1, w2, b2]
header('Exercise 8: Backpropagation')
loss, cache = forward(x, y, w1, b1, w2, b2)
grads = backward(cache, y, w1, b1, w2, b2)
# Numerical gradient check
h = 1e-5
for i, (pname, pval) in enumerate([('w1',w1),('b1',b1),('w2',w2),('b2',b2)]):
p_plus = params[:]; p_plus[i] += h
p_minus = params[:]; p_minus[i] -= h
L_p = forward(x, y, *p_plus)[0]
L_m = forward(x, y, *p_minus)[0]
num_grad = (L_p - L_m) / (2*h)
check_close(f'd{pname}', grads[pname], num_grad, tol=1e-6)
# (d) Training loop
eta = 0.5
losses = []
for _ in range(200):
loss, cache = forward(x, y, w1, b1, w2, b2)
losses.append(loss)
grads = backward(cache, y, w1, b1, w2, b2)
w1 -= eta*grads['w1']; b1 -= eta*grads['b1']
w2 -= eta*grads['w2']; b2 -= eta*grads['b2']
check_true('Loss decreased (training works)', losses[-1] < losses[0])
print(f'Initial loss: {losses[0]:.6f}, Final loss: {losses[-1]:.6f}')
try:
import matplotlib.pyplot as plt
plt.figure(figsize=(8,5))
plt.semilogy(losses, 'b-', lw=2)
plt.xlabel('Step'); plt.ylabel('Loss (log)')
plt.title('2-Layer Network Training Loss')
plt.tight_layout(); plt.show()
except ImportError:
pass
print('\nTakeaway: Backpropagation = chain rule applied layer by layer from output to input.')
print("Exercise 8 solution complete.")
Exercise 9 (★★★): Binary Cross-Entropy Gradient from Logits
For a label $y \in \{0, 1\}$ and logit $a$, the binary cross-entropy is
$$\ell(a, y) = -y \log \sigma(a) - (1 - y) \log\bigl(1 - \sigma(a)\bigr).$$
Derive and verify the compact derivative
$$\frac{\partial \ell}{\partial a} = \sigma(a) - y.$$
Code cell 29
# Your Solution
# Exercise 9 - learner workspace
# Derive d loss / d logit and verify it with finite differences.
print("Learner workspace ready for Exercise 9.")
Code cell 30
# Solution
# Exercise 9 - BCE logit gradient
header("Exercise 9: BCE gradient from logits")
def sigmoid(a):
a = np.asarray(a, dtype=float)
return np.where(a >= 0, 1 / (1 + np.exp(-a)), np.exp(a) / (1 + np.exp(a)))
def bce_from_logit(a, y):
# Stable equivalent of -y log(sigmoid(a)) - (1-y) log(1-sigmoid(a)).
return np.maximum(a, 0) - a * y + np.log1p(np.exp(-np.abs(a)))
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
for a, y in [(-2.0, 0.0), (-0.4, 1.0), (1.7, 1.0), (3.0, 0.0)]:
analytic = float(sigmoid(a) - y)
numeric = centered_diff(lambda t: bce_from_logit(t, y), a)
print(f"a={a:+.2f}, y={y:.0f}: analytic={analytic:+.8f}, numeric={numeric:+.8f}")
check_close("gradient check", analytic, numeric, tol=1e-6)
print("Takeaway: classification backprop sends prediction error, sigmoid(a)-y, through the graph.")
Exercise 10 (★★★): Manual Backprop for a Tiny Tanh Network
Consider the one-hidden-layer network
$$\mathbf{a} = W\mathbf{x} + \mathbf{b},\qquad \mathbf{h} = \tanh(\mathbf{a}),\qquad \hat{y} = \mathbf{v}^\top \mathbf{h} + c,\qquad \mathcal{L} = \tfrac{1}{2}(\hat{y} - y)^2.$$
Derive gradients for $W$, $\mathbf{b}$, $\mathbf{v}$, and $c$ (and for the input $\mathbf{x}$), and verify one component by finite differences.
Code cell 32
# Your Solution
# Exercise 10 - learner workspace
# Work backward from dL/dyhat through the tanh hidden layer.
print("Learner workspace ready for Exercise 10.")
Code cell 33
# Solution
# Exercise 10 - manual backprop through tanh network
header("Exercise 10: manual backprop through tanh network")
W = np.array([[0.4, -0.2], [0.1, 0.3], [-0.5, 0.2]])
b = np.array([0.1, -0.2, 0.05])
v = np.array([0.7, -0.4, 0.2])
c = -0.1
x = np.array([1.2, -0.7])
y = 0.35
def forward(W, b, v, c, x, y):
a = W @ x + b
h = np.tanh(a)
yhat = v @ h + c
loss = 0.5 * (yhat - y)**2
return loss, (a, h, yhat)
loss, (a, h, yhat) = forward(W, b, v, c, x, y)
dyhat = yhat - y
grad_v = dyhat * h
grad_c = dyhat
grad_h = dyhat * v
grad_a = grad_h * (1 - h**2)
grad_W = np.outer(grad_a, x)
grad_b = grad_a
grad_x = W.T @ grad_a
# Finite-difference check for W[1,0].
def loss_w10(w10):
W2 = W.copy(); W2[1,0] = w10
return forward(W2, b, v, c, x, y)[0]
num = (loss_w10(W[1,0] + 1e-6) - loss_w10(W[1,0] - 1e-6)) / (2e-6)
print("loss:", loss)
print("grad_W:\n", grad_W)
print("grad_x:", grad_x)
check_close("finite difference W[1,0]", grad_W[1,0], num, tol=1e-6)
print("Takeaway: backprop is the chain rule organized so every local derivative is reused once.")
What to Review After Finishing
- Can you apply the chain rule to a 3+ layer composition without looking it up?
- Can you derive $\sigma'(x) = \sigma(x)\bigl(1 - \sigma(x)\bigr)$ from scratch?
- Do you understand why centered differences have $O(h^2)$ error vs. $O(h)$ for forward differences?
- Can you explain the vanishing gradient problem quantitatively?
- Can you implement a full forward/backward pass for a 2-layer network?
- Do you know the second derivative test — including when it's inconclusive?
References
- Stewart, J. Calculus: Early Transcendentals, 8th ed. — Chapters 3–4
- Goodfellow, Bengio, Courville. Deep Learning — Chapter 6 (backpropagation)
- Rumelhart et al. (1986). Learning representations by back-propagating errors. Nature.
- He et al. (2015). Delving deep into rectifiers (ReLU/PReLU analysis)
- Kingma & Ba (2015). Adam: A method for stochastic optimization