Theory Notebook

Converted from theory.ipynb for web reading.

Reinforcement Learning

This notebook is the executable companion to notes.md. It keeps the MDPs tiny so Bellman equations, TD targets, policy gradients, PPO clipping, and preference losses can be inspected numerically.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def softmax(logits):
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits)
    exp = np.exp(shifted)
    return exp / exp.sum()

def chain_mdp():
    # Three nonterminal states plus one terminal state.
    n_states, n_actions, terminal = 4, 2, 3
    P = np.zeros((n_states, n_actions, n_states))
    R = np.zeros((n_states, n_actions, n_states))
    for s in range(n_states):
        if s == terminal:
            P[s, :, terminal] = 1.0
            continue
        left = max(0, s - 1)
        right = min(terminal, s + 1)
        P[s, 0, left] = 1.0
        P[s, 1, right] = 1.0
        if right == terminal:
            R[s, 1, terminal] = 1.0
    return P, R, terminal

def policy_transition_reward(P, R, policy):
    P_pi = np.einsum("sa,san->sn", policy, P)
    r_pi = np.einsum("sa,san,san->s", policy, P, R)
    return P_pi, r_pi

def policy_evaluation(P, R, policy, gamma=0.9):
    P_pi, r_pi = policy_transition_reward(P, R, policy)
    A = np.eye(P.shape[0]) - gamma * P_pi
    return np.linalg.solve(A, r_pi)

def bellman_optimality_backup(P, R, V, gamma=0.9):
    q = np.einsum("san,san->sa", P, R + gamma * V[None, None, :])
    return q.max(axis=1), q.argmax(axis=1), q

def value_iteration(P, R, gamma=0.9, steps=25):
    V = np.zeros(P.shape[0])
    history = []
    for _ in range(steps):
        new_V, policy, q = bellman_optimality_backup(P, R, V, gamma)
        history.append(float(np.max(np.abs(new_V - V))))
        V = new_V
    return V, policy, np.array(history)

def td_update(v_s, reward, v_next, alpha=0.2, gamma=0.9, done=False):
    target = reward if done else reward + gamma * v_next
    return v_s + alpha * (target - v_s), target

def q_learning_update(q_sa, reward, next_q, alpha=0.3, gamma=0.9, done=False):
    target = reward if done else reward + gamma * np.max(next_q)
    return q_sa + alpha * (target - q_sa), target

def sarsa_update(q_sa, reward, q_next_action, alpha=0.3, gamma=0.9, done=False):
    target = reward if done else reward + gamma * q_next_action
    return q_sa + alpha * (target - q_sa), target

def discounted_returns(rewards, gamma=0.9):
    out = []
    g = 0.0
    for r in reversed(rewards):
        g = float(r) + gamma * g
        out.append(g)
    return np.array(list(reversed(out)))

def generalized_advantages(rewards, values, gamma=0.9, lam=0.95):
    # `values` must hold len(rewards) + 1 entries; the last is the bootstrap value.
    rewards = np.asarray(rewards, dtype=float)
    values = np.asarray(values, dtype=float)
    adv = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        adv[t] = gae
    return adv

def ppo_clip_objective(ratio, advantage, eps=0.2):
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1 - eps, 1 + eps) * advantage
    return np.minimum(unclipped, clipped)

print("Reinforcement-learning helpers ready.")

Demo 1: Sequential decision making

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 5

header("Demo 1: Sequential decision making - policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Policy value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "states closer to reward have higher value")

Demo 2: Rewards, returns, and delayed credit

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 7

header("Demo 2: Rewards returns and delayed credit - value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=20)
print("Optimal value:", np.round(V, 4).tolist())
print("Greedy actions:", policy.tolist())
print("Final Bellman change:", round(float(history[-1]), 8))
check_true(np.all(policy[:3] == 1), "optimal policy moves right toward terminal reward")
fig, ax = plt.subplots()
ax.plot(history, color=COLORS["primary"], label="Bellman change")
ax.set_title("Value iteration convergence")
ax.set_xlabel("Backup step")
ax.set_ylabel("Sup-norm change")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)

Demo 3: Exploration versus exploitation

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 9

header("Demo 3: Exploration versus exploitation - Monte Carlo return")
rewards = [0, 0, 1]
returns = discounted_returns(rewards, gamma=0.9)
print("Rewards:", rewards)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="delayed reward discounted twice")
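
The heading concerns exploration, so here is a minimal epsilon-greedy bandit sketch to complement the return calculation above; the arm means, epsilon, and pull count are made-up numbers, not values from the notes. Most pulls should concentrate on the best arm while the others still get sampled.

rng = np.random.default_rng(0)
true_means = np.array([0.2, 0.5, 0.8])   # hypothetical arm means
counts = np.zeros(3)
estimates = np.zeros(3)
for _ in range(2000):
    explore = rng.random() < 0.1
    a = int(rng.integers(3)) if explore else int(np.argmax(estimates))
    r = rng.normal(true_means[a], 0.1)
    counts[a] += 1
    estimates[a] += (r - estimates[a]) / counts[a]   # incremental sample mean
print("Pull counts:", counts.astype(int).tolist())
check_true(np.argmax(counts) == np.argmax(true_means), "most pulls go to the best arm")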

Demo 4: RL versus supervised learning

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 11

header("Demo 4: RL versus supervised learning - TD(0) update")
new_v, target = td_update(v_s=0.4, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
print("TD target:", round(target, 4))
print("Updated value:", round(new_v, 4))
check_close(new_v, 0.56, name="TD update moves halfway to target")

Demo 5: Where RL appears in LLM systems

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 13

header("Demo 5: Where RL appears in LLM systems - SARSA versus Q-learning")
next_q = np.array([0.2, 0.9])
q_sarsa, target_sarsa = sarsa_update(0.1, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, target_ql = q_learning_update(0.1, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA target:", round(target_sarsa, 4), "updated:", round(q_sarsa, 4))
print("Q-learning target:", round(target_ql, 4), "updated:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "greedy off-policy target is larger here")

Demo 6: States, actions, rewards, and transitions

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 15

header("Demo 6: States actions rewards and transitions - epsilon greedy distribution")
q = np.array([1.0, 2.0, 0.5])
epsilon = 0.15
probs = np.ones_like(q) * epsilon / len(q)
probs[np.argmax(q)] += 1 - epsilon
print("Action probabilities:", np.round(probs, 4).tolist())
check_close(probs.sum(), 1.0, name="probabilities sum to one")
check_true(np.argmax(probs) == np.argmax(q), "greedy action remains most likely")

Demo 7: The Markov property

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 17

header("Demo 7: The Markov property - policy gradient bandit")
logits = np.array([0.2, -0.1])
probs = softmax(logits)
action = 0  # index of the sampled action
advantage = 1.5
grad_logp = np.eye(len(probs))[action] - probs  # score function: one-hot(action) minus probs
grad = grad_logp * advantage
print("Policy probs:", np.round(probs, 4).tolist())
print("Gradient estimate:", np.round(grad, 4).tolist())
check_close(grad.sum(), 0.0, name="softmax score gradient sums to zero")

Demo 8: Episodic, continuing, finite, and infinite-horizon tasks

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 19

header("Demo 8: Episodic continuing finite and infinite horizon tasks - generalized advantage estimation")
rewards = np.array([0.0, 0.2, 1.0])
values = np.array([0.3, 0.4, 0.7, 0.0])
adv = generalized_advantages(rewards, values, gamma=0.9, lam=0.8)
print("Advantages:", np.round(adv, 4).tolist())
check_true(adv[-1] > 0, "terminal reward creates positive final advantage")

Demo 9: Transition kernels and reward functions

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 21

header("Demo 9: Transition kernels and reward functions - PPO clipping")
ratios = np.array([0.7, 0.9, 1.0, 1.2, 1.5])
adv = np.ones_like(ratios)
obj = ppo_clip_objective(ratios, adv, eps=0.2)
print("Ratios:", ratios.tolist())
print("Clipped objective:", np.round(obj, 4).tolist())
check_close(obj[-1], 1.2, name="positive advantage clips high ratio")
fig, ax = plt.subplots()
ax.plot(ratios, ratios * adv, color=COLORS["secondary"], label="unclipped")
ax.plot(ratios, obj, color=COLORS["primary"], label="clipped")
ax.set_title("PPO clipping for positive advantage")
ax.set_xlabel("Policy ratio")
ax.set_ylabel("Surrogate contribution")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
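
Since the heading names transition kernels and reward functions, one quick structural check on the chain MDP itself: every (state, action) row of P must be a probability distribution, and the expected reward r(s, a) falls out of P and R together.

P, R, _ = chain_mdp()
check_true(np.all(P >= 0) and np.allclose(P.sum(axis=2), 1.0), "each P[s, a, :] is a valid distribution")
expected_r = np.einsum("san,san->sa", P, R)   # r(s, a) = sum_n P(n|s,a) R(s,a,n)
print("Expected reward r(s, a):", np.round(expected_r, 4).tolist())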

Demo 10: Partial observability and belief states

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 23

header("Demo 10: Partial observability and belief states - preference loss")
policy_gap = 0.8
reference_gap = 0.2
beta = 0.3
logit = beta * (policy_gap - reference_gap)
prob_preferred = 1 / (1 + np.exp(-logit))
loss = -np.log(prob_preferred)
print("Preference probability:", round(float(prob_preferred), 4))
print("DPO-style loss:", round(float(loss), 4))
check_true(0.0 < prob_preferred < 1.0, "preference probability is valid")

Demo 11: Discounted return

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 25

header("Demo 11: Discounted return - policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Policy value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "states closer to reward have higher value")

Demo 12: Deterministic and stochastic policies

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 27

header("Demo 12: Deterministic and stochastic policies - value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=20)
print("Optimal value:", np.round(V, 4).tolist())
print("Greedy actions:", policy.tolist())
print("Final Bellman change:", round(float(history[-1]), 8))
check_true(np.all(policy[:3] == 1), "optimal policy moves right toward terminal reward")
fig, ax = plt.subplots()
ax.plot(history, color=COLORS["primary"], label="Bellman change")
ax.set_title("Value iteration convergence")
ax.set_xlabel("Backup step")
ax.set_ylabel("Sup-norm change")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
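
To tie the heading back to the code: the greedy actions above define a deterministic policy. A minimal comparison against the stochastic policy from Demo 1 shows the deterministic right-moving policy dominates it state by state on this chain.

P, R, _ = chain_mdp()
deterministic = np.eye(2)[[1, 1, 1, 0]]   # one-hot rows: always move right
stochastic = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V_det = policy_evaluation(P, R, deterministic, gamma=0.9)
V_sto = policy_evaluation(P, R, stochastic, gamma=0.9)
print("Deterministic:", np.round(V_det, 4).tolist())
print("Stochastic:   ", np.round(V_sto, 4).tolist())
check_true(np.all(V_det[:3] >= V_sto[:3]), "right-moving deterministic policy dominates here")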

Demo 13: State-value and action-value functions

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 29

header("Demo 13: State-value and action-value functions - Monte Carlo return")
rewards = [0, 0, 1]
returns = discounted_returns(rewards, gamma=0.9)
print("Rewards:", rewards)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="delayed reward discounted twice")
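
To connect the heading's two value functions directly, a short sketch reusing the Demo 1 policy: build Q_pi by one-step lookahead on V_pi and confirm the identity V_pi(s) = sum_a pi(a|s) Q_pi(s, a).

P, R, _ = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
Q = np.einsum("san,san->sa", P, R + 0.9 * V[None, None, :])   # one-step lookahead
check_close(np.max(np.abs((policy * Q).sum(axis=1) - V)), 0.0, tol=1e-10,
            name="V is the policy-weighted average of Q")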

Demo 14: Advantage functions

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 31

header("Demo 14: Advantage functions - TD(0) update")
new_v, target = td_update(v_s=0.4, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
print("TD target:", round(target, 4))
print("Updated value:", round(new_v, 4))
check_close(new_v, 0.56, name="TD update moves halfway to target")
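
The heading names advantage functions, so here is the exact, model-based version on the chain MDP with the Demo 1 policy: A_pi = Q_pi - V_pi, whose policy-weighted mean is zero in every state.

P, R, _ = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
Q = np.einsum("san,san->sa", P, R + 0.9 * V[None, None, :])
A = Q - V[:, None]
print("Advantages:", np.round(A, 4).tolist())
check_close(np.max(np.abs((policy * A).sum(axis=1))), 0.0, tol=1e-10,
            name="advantages average to zero under the policy")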

Demo 15: Occupancy measures

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 33

header("Demo 15: Occupancy measures - SARSA versus Q-learning")
next_q = np.array([0.2, 0.9])
q_sarsa, target_sarsa = sarsa_update(0.1, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, target_ql = q_learning_update(0.1, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA target:", round(target_sarsa, 4), "updated:", round(q_sarsa, 4))
print("Q-learning target:", round(target_ql, 4), "updated:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "greedy off-policy target is larger here")

Demo 16: Bellman expectation equation

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 35

header("Demo 16: Bellman expectation equation - epsilon greedy distribution")
q = np.array([1.0, 2.0, 0.5])
epsilon = 0.15
probs = np.ones_like(q) * epsilon / len(q)
probs[np.argmax(q)] += 1 - epsilon
print("Action probabilities:", np.round(probs, 4).tolist())
check_close(probs.sum(), 1.0, name="probabilities sum to one")
check_true(np.argmax(probs) == np.argmax(q), "greedy action remains most likely")
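
To check the equation the heading names, verify that the evaluated value function is a fixed point of the expectation backup: V_pi = r_pi + gamma P_pi V_pi, using the helpers and the Demo 1 policy.

P, R, _ = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
P_pi, r_pi = policy_transition_reward(P, R, policy)
residual = np.max(np.abs(r_pi + 0.9 * P_pi @ V - V))
check_close(residual, 0.0, tol=1e-10, name="Bellman expectation equation holds")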

Demo 17: Bellman optimality equation

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 37

header("Demo 17: Bellman optimality equation - policy gradient bandit")
logits = np.array([0.2, -0.1])
probs = softmax(logits)
action = 0  # index of the sampled action
advantage = 1.5
grad_logp = np.eye(len(probs))[action] - probs  # score function: one-hot(action) minus probs
grad = grad_logp * advantage
print("Policy probs:", np.round(probs, 4).tolist())
print("Gradient estimate:", np.round(grad, 4).tolist())
check_close(grad.sum(), 0.0, name="softmax score gradient sums to zero")
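
Likewise for the optimality equation: the value-iteration output should be a fixed point of bellman_optimality_backup, exactly so on this chain, which converges in a handful of backups.

P, R, _ = chain_mdp()
V_star, _, _ = value_iteration(P, R, gamma=0.9, steps=20)
backed_up, _, _ = bellman_optimality_backup(P, R, V_star, gamma=0.9)
check_close(np.max(np.abs(backed_up - V_star)), 0.0, tol=1e-10,
            name="V* is a fixed point of the optimality backup")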

Demo 18: Contraction mapping intuition

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 39

header("Demo 18: Contraction mapping intuition - generalized advantage estimation")
rewards = np.array([0.0, 0.2, 1.0])
values = np.array([0.3, 0.4, 0.7, 0.0])
adv = generalized_advantages(rewards, values, gamma=0.9, lam=0.8)
print("Advantages:", np.round(adv, 4).tolist())
check_true(adv[-1] > 0, "terminal reward creates positive final advantage")

Demo 19: Matrix form for finite MDPs

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 41

header("Demo 19: Matrix form for finite MDPs - PPO clipping")
ratios = np.array([0.7, 0.9, 1.0, 1.2, 1.5])
adv = np.ones_like(ratios)
obj = ppo_clip_objective(ratios, adv, eps=0.2)
print("Ratios:", ratios.tolist())
print("Clipped objective:", np.round(obj, 4).tolist())
check_close(obj[-1], 1.2, name="positive advantage clips high ratio")
fig, ax = plt.subplots()
ax.plot(ratios, ratios * adv, color=COLORS["secondary"], label="unclipped")
ax.plot(ratios, obj, color=COLORS["primary"], label="clipped")
ax.set_title("PPO clipping for positive advantage")
ax.set_xlabel("Policy ratio")
ax.set_ylabel("Surrogate contribution")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
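
To match the heading, here is the matrix form of policy evaluation in one line, V_pi = (I - gamma P_pi)^{-1} r_pi, which should agree with the linear solve inside policy_evaluation.

P, R, _ = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
P_pi, r_pi = policy_transition_reward(P, R, policy)
V_matrix = np.linalg.inv(np.eye(4) - 0.9 * P_pi) @ r_pi
check_close(np.max(np.abs(V_matrix - policy_evaluation(P, R, policy, gamma=0.9))), 0.0,
            tol=1e-10, name="matrix inverse matches the linear solve")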

Demo 20: Bellman residuals

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 43

header("Demo 20: Bellman residuals - preference loss")
policy_gap = 0.8
reference_gap = 0.2
beta = 0.3
logit = beta * (policy_gap - reference_gap)
prob_preferred = 1 / (1 + np.exp(-logit))
loss = -np.log(prob_preferred)
print("Preference probability:", round(float(prob_preferred), 4))
print("DPO-style loss:", round(float(loss), 4))
check_true(0.0 < prob_preferred < 1.0, "preference probability is valid")

Demo 21: Policy evaluation

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 45

header("Demo 21: Policy evaluation - policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Policy value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "states closer to reward have higher value")

Demo 22: Policy improvement

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 47

header("Demo 22: Policy improvement - value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=20)
print("Optimal value:", np.round(V, 4).tolist())
print("Greedy actions:", policy.tolist())
print("Final Bellman change:", round(float(history[-1]), 8))
check_true(np.all(policy[:3] == 1), "optimal policy moves right toward terminal reward")
fig, ax = plt.subplots()
ax.plot(history, color=COLORS["primary"], label="Bellman change")
ax.set_title("Value iteration convergence")
ax.set_xlabel("Backup step")
ax.set_ylabel("Sup-norm change")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
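
The policy improvement theorem can be checked directly on the chain with the Demo 1 policy: acting greedily with respect to Q_pi never lowers the value of any state.

P, R, _ = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
Q = np.einsum("san,san->sa", P, R + 0.9 * V[None, None, :])
greedy = np.eye(2)[Q.argmax(axis=1)]       # one-hot greedy policy
V_greedy = policy_evaluation(P, R, greedy, gamma=0.9)
check_true(np.all(V_greedy >= V - 1e-12), "greedy policy weakly improves every state")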

Demo 23: Policy iteration

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 49

header("Demo 23: Policy iteration - Monte Carlo return")
rewards = [0, 0, 1]
returns = discounted_returns(rewards, gamma=0.9)
print("Rewards:", rewards)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="delayed reward discounted twice")
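
Since the heading names policy iteration, here is a minimal loop built from the helpers above: evaluate, improve greedily, and stop once the policy is stable. The uniform starting policy and iteration cap are arbitrary choices.

P, R, _ = chain_mdp()
policy_pi = np.full((4, 2), 0.5)           # start from the uniform policy
for _ in range(10):
    V = policy_evaluation(P, R, policy_pi, gamma=0.9)
    Q = np.einsum("san,san->sa", P, R + 0.9 * V[None, None, :])
    improved = np.eye(2)[Q.argmax(axis=1)]
    if np.array_equal(improved, policy_pi):
        break
    policy_pi = improved
print("Greedy actions:", policy_pi.argmax(axis=1).tolist())
check_true(np.all(policy_pi[:3].argmax(axis=1) == 1), "policy iteration recovers the right-moving policy")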

Demo 24: Value iteration

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 51

header("Demo 24: Value iteration - TD(0) update")
new_v, target = td_update(v_s=0.4, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
print("TD target:", round(target, 4))
print("Updated value:", round(new_v, 4))
check_close(new_v, 0.56, name="TD update moves halfway to target")

Demo 25: Planning versus learning

Run this cell to connect the mathematical definition to a concrete numeric check.

Code cell 53

header("Demo 25: Planning versus learning - SARSA versus Q-learning")
next_q = np.array([0.2, 0.9])
q_sarsa, target_sarsa = sarsa_update(0.1, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, target_ql = q_learning_update(0.1, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA target:", round(target_sarsa, 4), "updated:", round(q_sarsa, 4))
print("Q-learning target:", round(target_ql, 4), "updated:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "greedy off-policy target is larger here")
