Theory Notebook
Converted from theory.ipynb for web reading.
Reinforcement Learning
This notebook is the executable companion to notes.md. It keeps the MDPs tiny so Bellman equations, TD targets, policy gradients, PPO clipping, and preference losses can be inspected numerically.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)
def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name
def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name
def softmax(logits):
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits)
    exp = np.exp(shifted)
    return exp / exp.sum()
def chain_mdp():
    # Three nonterminal states plus one terminal state; moving right from
    # state 2 pays a reward of 1 and the terminal state absorbs with reward 0.
    n_states, n_actions, terminal = 4, 2, 3
    P = np.zeros((n_states, n_actions, n_states))
    R = np.zeros((n_states, n_actions, n_states))
    for s in range(n_states):
        if s == terminal:
            P[s, :, terminal] = 1.0
            continue
        left = max(0, s - 1)
        right = min(terminal, s + 1)
        P[s, 0, left] = 1.0
        P[s, 1, right] = 1.0
        if right == terminal:
            R[s, 1, terminal] = 1.0
    return P, R, terminal
def policy_transition_reward(P, R, policy):
    # Average the transition kernel and rewards over the policy's action probabilities.
    P_pi = np.einsum("sa,san->sn", policy, P)
    r_pi = np.einsum("sa,san,san->s", policy, P, R)
    return P_pi, r_pi
def policy_evaluation(P, R, policy, gamma=0.9):
    # Solve the linear Bellman expectation system (I - gamma * P_pi) V = r_pi exactly.
    P_pi, r_pi = policy_transition_reward(P, R, policy)
    A = np.eye(P.shape[0]) - gamma * P_pi
    return np.linalg.solve(A, r_pi)
def bellman_optimality_backup(P, R, V, gamma=0.9):
    # One Bellman optimality backup: compute Q(s, a), then take a greedy max over actions.
    q = np.einsum("san,san->sa", P, R + gamma * V[None, None, :])
    return q.max(axis=1), q.argmax(axis=1), q
def value_iteration(P, R, gamma=0.9, steps=25):
    # Repeated optimality backups; the history records the sup-norm change per sweep.
    V = np.zeros(P.shape[0])
    history = []
    for _ in range(steps):
        new_V, policy, q = bellman_optimality_backup(P, R, V, gamma)
        history.append(float(np.max(np.abs(new_V - V))))
        V = new_V
    return V, policy, np.array(history)
def td_update(v_s, reward, v_next, alpha=0.2, gamma=0.9, done=False):
    # TD(0): move v(s) a step of size alpha toward the bootstrapped target.
    target = reward if done else reward + gamma * v_next
    return v_s + alpha * (target - v_s), target
def q_learning_update(q_sa, reward, next_q, alpha=0.3, gamma=0.9, done=False):
    # Off-policy: bootstrap from the greedy (max) next-state action value.
    target = reward if done else reward + gamma * np.max(next_q)
    return q_sa + alpha * (target - q_sa), target
def sarsa_update(q_sa, reward, q_next_action, alpha=0.3, gamma=0.9, done=False):
    # On-policy: bootstrap from the action the behaviour policy actually takes next.
    target = reward if done else reward + gamma * q_next_action
    return q_sa + alpha * (target - q_sa), target
def discounted_returns(rewards, gamma=0.9):
    # Backward pass computing G_t = r_t + gamma * G_{t+1}.
    out = []
    g = 0.0
    for r in reversed(rewards):
        g = float(r) + gamma * g
        out.append(g)
    return np.array(list(reversed(out)))
def generalized_advantages(rewards, values, gamma=0.9, lam=0.95):
    # GAE: discounted, lambda-weighted sum of one-step TD residuals.
    # `values` must carry one extra bootstrap entry for the state after the last reward.
    rewards = np.asarray(rewards, dtype=float)
    values = np.asarray(values, dtype=float)
    adv = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        adv[t] = gae
    return adv
def ppo_clip_objective(ratio, advantage, eps=0.2):
    # Pessimistic minimum of the unclipped and clipped importance-weighted advantage.
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1 - eps, 1 + eps) * advantage
    return np.minimum(unclipped, clipped)
print("Reinforcement-learning helpers ready.")
Demo 1: Sequential decision making
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 5
header("Demo 1: Sequential decision making - policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Policy value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "states closer to reward have higher value")
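The same fixed point can be reached by simply iterating the Bellman expectation backup instead of solving the linear system; a minimal sketch using the helpers above:
P_pi, r_pi = policy_transition_reward(P, R, policy)
V_iter = np.zeros(P.shape[0])
for _ in range(200):
    V_iter = r_pi + 0.9 * P_pi @ V_iter  # one Bellman expectation backup
check_close(np.max(np.abs(V_iter - V)), 0.0, tol=1e-6, name="iterative evaluation matches the linear solve")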
Demo 2: Rewards, returns, and delayed credit
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 7
header("Demo 2: Rewards returns and delayed credit - value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=20)
print("Optimal value:", np.round(V, 4).tolist())
print("Greedy actions:", policy.tolist())
print("Final Bellman change:", round(float(history[-1]), 8))
check_true(np.all(policy[:3] == 1), "optimal policy moves right toward terminal reward")
fig, ax = plt.subplots()
ax.plot(history, color=COLORS["primary"], label="Bellman change")
ax.set_title("Value iteration convergence")
ax.set_xlabel("Backup step")
ax.set_ylabel("Sup-norm change")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
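Because the chain is deterministic and the only reward sits on the final transition, the optimal values should equal gamma raised to the number of steps remaining before that reward; a quick sanity check:
expected = np.array([0.9 ** 2, 0.9, 1.0, 0.0])  # gamma^(steps to the rewarding transition), terminal worth 0
check_close(np.max(np.abs(V - expected)), 0.0, tol=1e-6, name="optimal values match the closed form gamma^k")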
Demo 3: Exploration versus exploitation
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 9
header("Demo 3: Exploration versus exploitation - Monte Carlo return")
rewards = [0, 0, 1]
returns = discounted_returns(rewards, gamma=0.9)
print("Rewards:", rewards)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="delayed reward discounted twice")
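The returns also satisfy the recursion G_t = r_t + gamma * G_{t+1}, which is worth verifying once explicitly:
check_close(returns[0], rewards[0] + 0.9 * returns[1], name="return recursion G_t = r_t + gamma * G_{t+1}")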
Demo 4: RL versus supervised learning
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 11
header("Demo 4: RL versus supervised learning - TD(0) update")
new_v, target = td_update(v_s=0.4, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
print("TD target:", round(target, 4))
print("Updated value:", round(new_v, 4))
check_close(new_v, 0.56, name="TD update moves halfway to target")
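If the target stays fixed, each repeated update shrinks the remaining error by a factor of (1 - alpha), so the value converges to the target geometrically; a small sketch:
v = 0.4
for _ in range(20):
    v, _ = td_update(v_s=v, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
check_close(v, 0.72, tol=1e-5, name="repeated updates converge to the fixed TD target")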
Demo 5: Where RL appears in LLM systems
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 13
header("Demo 5: Where RL appears in LLM systems - SARSA versus Q-learning")
next_q = np.array([0.2, 0.9])
q_sarsa, target_sarsa = sarsa_update(0.1, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, target_ql = q_learning_update(0.1, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA target:", round(target_sarsa, 4), "updated:", round(q_sarsa, 4))
print("Q-learning target:", round(target_ql, 4), "updated:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "greedy off-policy target is larger here")
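The gap disappears when the behaviour policy happens to pick the greedy next action, since the SARSA and Q-learning targets then coincide; a quick check:
greedy_next = float(next_q[np.argmax(next_q)])
_, target_sarsa_greedy = sarsa_update(0.1, reward=0.0, q_next_action=greedy_next, alpha=0.5, gamma=0.9)
check_close(target_sarsa_greedy, target_ql, name="SARSA target matches Q-learning when the next action is greedy")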
Demo 6: States, actions, rewards, and transitions
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 15
header("Demo 6: States actions rewards and transitions - epsilon greedy distribution")
q = np.array([1.0, 2.0, 0.5])
epsilon = 0.15
probs = np.ones_like(q) * epsilon / len(q)
probs[np.argmax(q)] += 1 - epsilon
print("Action probabilities:", np.round(probs, 4).tolist())
check_close(probs.sum(), 1.0, name="probabilities sum to one")
check_true(np.argmax(probs) == np.argmax(q), "greedy action remains most likely")
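Exploration has a price: the expected action value under this distribution sits strictly below the greedy value whenever the Q-values differ; a quick check:
expected_q = float(probs @ q)
print("Expected Q under epsilon-greedy:", round(expected_q, 4))
check_true(expected_q < np.max(q), "exploration sacrifices a little value relative to acting greedily")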
Demo 7: The Markov property
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 17
header("Demo 7: The Markov property - policy gradient bandit")
logits = np.array([0.2, -0.1])
probs = softmax(logits)
action = 0
advantage = 1.5
grad_logp = np.array([1.0, 0.0]) - probs
grad = grad_logp * advantage
print("Policy probs:", np.round(probs, 4).tolist())
print("Gradient estimate:", np.round(grad, 4).tolist())
check_close(grad.sum(), 0.0, name="softmax score gradient sums to zero")
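The same identity holds in expectation: averaging the score vector over actions drawn from the policy gives zero, which is why subtracting a baseline leaves the policy gradient unbiased; a minimal check:
scores = np.eye(len(probs)) - probs[None, :]  # score vector (one-hot minus probs) for each action
expected_score = probs @ scores               # E_pi[ grad log pi(a) ]
check_close(np.max(np.abs(expected_score)), 0.0, tol=1e-12, name="expected score vector is zero")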
Demo 8: Episodic, continuing, finite, and infinite horizon tasks
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 19
header("Demo 8: Episodic continuing finite and infinite horizon tasks - generalized advantage estimation")
rewards = np.array([0.0, 0.2, 1.0])
values = np.array([0.3, 0.4, 0.7, 0.0])
adv = generalized_advantages(rewards, values, gamma=0.9, lam=0.8)
print("Advantages:", np.round(adv, 4).tolist())
check_true(adv[-1] > 0, "terminal reward creates positive final advantage")
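The lam parameter interpolates between the one-step TD residual (lam = 0) and a Monte Carlo style advantage (lam = 1); both limits can be checked directly with the same helper:
deltas = rewards + 0.9 * values[1:] - values[:-1]
adv_lam0 = generalized_advantages(rewards, values, gamma=0.9, lam=0.0)
check_close(np.max(np.abs(adv_lam0 - deltas)), 0.0, tol=1e-12, name="lam = 0 recovers the one-step TD errors")
adv_lam1 = generalized_advantages(rewards, values, gamma=0.9, lam=1.0)
mc_adv = discounted_returns(rewards, gamma=0.9) - values[:-1]
check_close(np.max(np.abs(adv_lam1 - mc_adv)), 0.0, tol=1e-9, name="lam = 1 recovers returns minus values")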
Demo 9: Transition kernels and reward functions
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 21
header("Demo 9: Transition kernels and reward functions - PPO clipping")
ratios = np.array([0.7, 0.9, 1.0, 1.2, 1.5])
adv = np.ones_like(ratios)
obj = ppo_clip_objective(ratios, adv, eps=0.2)
print("Ratios:", ratios.tolist())
print("Clipped objective:", np.round(obj, 4).tolist())
check_close(obj[-1], 1.2, name="positive advantage clips high ratio")
fig, ax = plt.subplots()
ax.plot(ratios, ratios * adv, color=COLORS["secondary"], label="unclipped")
ax.plot(ratios, obj, color=COLORS["primary"], label="clipped")
ax.set_title("PPO clipping for positive advantage")
ax.set_xlabel("Policy ratio")
ax.set_ylabel("Surrogate contribution")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
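The same helper also covers the negative-advantage branch, where the minimum caps the gain from pushing a bad action's ratio below 1 - eps but leaves the penalty for inflating it uncapped; a quick check:
obj_neg = ppo_clip_objective(ratios, -np.ones_like(ratios), eps=0.2)
print("Negative-advantage objective:", np.round(obj_neg, 4).tolist())
check_close(obj_neg[0], -0.8, name="gain from shrinking a bad action's ratio is capped at 1 - eps")
check_close(obj_neg[-1], -1.5, name="penalty for inflating a bad action's ratio is not capped")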
Demo 10: Partial observability and belief states
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 23
header("Demo 10: Partial observability and belief states - preference loss")
policy_gap = 0.8
reference_gap = 0.2
beta = 0.3
logit = beta * (policy_gap - reference_gap)
prob_preferred = 1 / (1 + np.exp(-logit))
loss = -np.log(prob_preferred)
print("Preference probability:", round(float(prob_preferred), 4))
print("DPO-style loss:", round(float(loss), 4))
check_true(0.0 < prob_preferred < 1.0, "preference probability is valid")
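Here policy_gap and reference_gap stand in for chosen-minus-rejected log-probability margins under the policy and the frozen reference model. Written out with per-response log-probabilities (the numbers below are illustrative placeholders, not model outputs), the same loss reads:
logp_chosen, logp_rejected = -4.0, -4.8            # assumed policy log-probs for the preferred / rejected response
ref_logp_chosen, ref_logp_rejected = -4.2, -4.4    # assumed reference-model log-probs
margin = (logp_chosen - logp_rejected) - (ref_logp_chosen - ref_logp_rejected)
dpo_loss = -np.log(1 / (1 + np.exp(-beta * margin)))
print("DPO-style loss from log-probs:", round(float(dpo_loss), 4))
check_close(dpo_loss, loss, tol=1e-6, name="log-probability form matches the gap form above")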
Demo 11: Discounted return
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 25
header("Demo 11: Discounted return - policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Policy value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "states closer to reward have higher value")
Demo 12: Deterministic and stochastic policies
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 27
header("Demo 12: Deterministic and stochastic policies - value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=20)
print("Optimal value:", np.round(V, 4).tolist())
print("Greedy actions:", policy.tolist())
print("Final Bellman change:", round(float(history[-1]), 8))
check_true(np.all(policy[:3] == 1), "optimal policy moves right toward terminal reward")
fig, ax = plt.subplots()
ax.plot(history, color=COLORS["primary"], label="Bellman change")
ax.set_title("Value iteration convergence")
ax.set_xlabel("Backup step")
ax.set_ylabel("Sup-norm change")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 13: State-value and action-value functions
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 29
header("Demo 13: State-value and action-value functions - Monte Carlo return")
rewards = [0, 0, 1]
returns = discounted_returns(rewards, gamma=0.9)
print("Rewards:", rewards)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="delayed reward discounted twice")
Demo 14: Advantage functions
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 31
header("Demo 14: Advantage functions - TD(0) update")
new_v, target = td_update(v_s=0.4, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
print("TD target:", round(target, 4))
print("Updated value:", round(new_v, 4))
check_close(new_v, 0.56, name="TD update moves halfway to target")
Demo 15: Occupancy measures
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 33
header("Demo 15: Occupancy measures - SARSA versus Q-learning")
next_q = np.array([0.2, 0.9])
q_sarsa, target_sarsa = sarsa_update(0.1, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, target_ql = q_learning_update(0.1, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA target:", round(target_sarsa, 4), "updated:", round(q_sarsa, 4))
print("Q-learning target:", round(target_ql, 4), "updated:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "greedy off-policy target is larger here")
Demo 16: Bellman expectation equation
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 35
header("Demo 16: Bellman expectation equation - epsilon greedy distribution")
q = np.array([1.0, 2.0, 0.5])
epsilon = 0.15
probs = np.ones_like(q) * epsilon / len(q)
probs[np.argmax(q)] += 1 - epsilon
print("Action probabilities:", np.round(probs, 4).tolist())
check_close(probs.sum(), 1.0, name="probabilities sum to one")
check_true(np.argmax(probs) == np.argmax(q), "greedy action remains most likely")
Demo 17: Bellman optimality equation
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 37
header("Demo 17: Bellman optimality equation - policy gradient bandit")
logits = np.array([0.2, -0.1])
probs = softmax(logits)
action = 0
advantage = 1.5
grad_logp = np.array([1.0, 0.0]) - probs
grad = grad_logp * advantage
print("Policy probs:", np.round(probs, 4).tolist())
print("Gradient estimate:", np.round(grad, 4).tolist())
check_close(grad.sum(), 0.0, name="softmax score gradient sums to zero")
Demo 18: Contraction mapping intuition
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 39
header("Demo 18: Contraction mapping intuition - generalized advantage estimation")
rewards = np.array([0.0, 0.2, 1.0])
values = np.array([0.3, 0.4, 0.7, 0.0])
adv = generalized_advantages(rewards, values, gamma=0.9, lam=0.8)
print("Advantages:", np.round(adv, 4).tolist())
check_true(adv[-1] > 0, "terminal reward creates positive final advantage")
Demo 19: Matrix form for finite MDPs
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 41
header("Demo 19: Matrix form for finite MDPs - PPO clipping")
ratios = np.array([0.7, 0.9, 1.0, 1.2, 1.5])
adv = np.ones_like(ratios)
obj = ppo_clip_objective(ratios, adv, eps=0.2)
print("Ratios:", ratios.tolist())
print("Clipped objective:", np.round(obj, 4).tolist())
check_close(obj[-1], 1.2, name="positive advantage clips high ratio")
fig, ax = plt.subplots()
ax.plot(ratios, ratios * adv, color=COLORS["secondary"], label="unclipped")
ax.plot(ratios, obj, color=COLORS["primary"], label="clipped")
ax.set_title("PPO clipping for positive advantage")
ax.set_xlabel("Policy ratio")
ax.set_ylabel("Surrogate contribution")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 20: Bellman residuals
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 43
header("Demo 20: Bellman residuals - preference loss")
policy_gap = 0.8
reference_gap = 0.2
beta = 0.3
logit = beta * (policy_gap - reference_gap)
prob_preferred = 1 / (1 + np.exp(-logit))
loss = -np.log(prob_preferred)
print("Preference probability:", round(float(prob_preferred), 4))
print("DPO-style loss:", round(float(loss), 4))
check_true(0.0 < prob_preferred < 1.0, "preference probability is valid")
Demo 21: Policy evaluation
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 45
header("Demo 21: Policy evaluation - policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.3, 0.7], [0.2, 0.8], [0.1, 0.9], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Policy value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "states closer to reward have higher value")
Demo 22: Policy improvement
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 47
header("Demo 22: Policy improvement - value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=20)
print("Optimal value:", np.round(V, 4).tolist())
print("Greedy actions:", policy.tolist())
print("Final Bellman change:", round(float(history[-1]), 8))
check_true(np.all(policy[:3] == 1), "optimal policy moves right toward terminal reward")
fig, ax = plt.subplots()
ax.plot(history, color=COLORS["primary"], label="Bellman change")
ax.set_title("Value iteration convergence")
ax.set_xlabel("Backup step")
ax.set_ylabel("Sup-norm change")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 23: Policy iteration
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 49
header("Demo 23: Policy iteration - Monte Carlo return")
rewards = [0, 0, 1]
returns = discounted_returns(rewards, gamma=0.9)
print("Rewards:", rewards)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="delayed reward discounted twice")
Demo 24: Value iteration
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 51
header("Demo 24: Value iteration - TD(0) update")
new_v, target = td_update(v_s=0.4, reward=0.0, v_next=0.8, alpha=0.5, gamma=0.9)
print("TD target:", round(target, 4))
print("Updated value:", round(new_v, 4))
check_close(new_v, 0.56, name="TD update moves halfway to target")
Demo 25: Planning versus learning
Run this cell to connect the mathematical definition to a concrete numeric check.
Code cell 53
header("Demo 25: Planning versus learning - SARSA versus Q-learning")
next_q = np.array([0.2, 0.9])
q_sarsa, target_sarsa = sarsa_update(0.1, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, target_ql = q_learning_update(0.1, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA target:", round(target_sarsa, 4), "updated:", round(q_sarsa, 4))
print("Q-learning target:", round(target_ql, 4), "updated:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "greedy off-policy target is larger here")