Exercises Notebook
Converted from exercises.ipynb for web reading.
Exercises: Tokenization Math
There are 10 exercises. Exercises 1-3 cover BPE mechanics, 4-7 cover information and cost, and 8-10 cover system diagnostics.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # Fall back to matplotlib's bundled seaborn style if seaborn is absent.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    # Banner separating one exercise's output from the next.
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    # Assert a boolean check, printing PASS/FAIL first.
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    # Assert a numeric check within an absolute tolerance.
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def char_tokens(text):
    # Character-level baseline tokenization.
    return tuple(text)

def pair_counts(corpus):
    # Count adjacent symbol pairs across a {word: frequency} corpus.
    counts = {}
    for word, freq in corpus.items():
        pieces = list(word)
        for a, b in zip(pieces, pieces[1:]):
            counts[(a, b)] = counts.get((a, b), 0) + freq
    return counts

def merge_word(word, pair):
    # Replace each non-overlapping occurrence of `pair` with its concatenation.
    out = []
    i = 0
    while i < len(word):
        if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
            out.append(word[i] + word[i + 1])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return tuple(out)

def bpe_train(words, num_merges=3):
    # Greedy BPE training: repeatedly merge the most frequent adjacent pair.
    corpus = {tuple(w): f for w, f in words.items()}
    merges = []
    for _ in range(num_merges):
        counts = {}
        for pieces, freq in corpus.items():
            for a, b in zip(pieces, pieces[1:]):
                counts[(a, b)] = counts.get((a, b), 0) + freq
        if not counts:
            break
        pair = max(counts, key=counts.get)
        merges.append(pair)
        corpus = {merge_word(pieces, pair): freq for pieces, freq in corpus.items()}
    return merges, corpus

def bpe_encode(text, merges):
    # Apply learned merges in training order to segment new text.
    pieces = tuple(text)
    for pair in merges:
        pieces = merge_word(pieces, pair)
    return list(pieces)

def entropy_from_counts(counts):
    # Shannon entropy (bits) of the empirical token distribution.
    vals = np.array(list(counts.values()), dtype=float)
    vals = vals[vals > 0]  # zero counts would make log2 produce NaN
    probs = vals / vals.sum()
    return float(-(probs * np.log2(probs)).sum())

def viterbi_segment(text, token_logprobs):
    # Dynamic program for the highest-log-prob segmentation under a unigram model.
    n = len(text)
    dp = [-1e18] * (n + 1)
    back = [None] * (n + 1)
    dp[0] = 0.0
    for i in range(n):
        if dp[i] < -1e17:
            continue  # position not reachable by any token path
        for tok, logp in token_logprobs.items():
            if text.startswith(tok, i):
                j = i + len(tok)
                score = dp[i] + logp
                if score > dp[j]:
                    dp[j] = score
                    back[j] = (i, tok)
    if back[n] is None:
        return [], -np.inf
    out = []
    pos = n
    while pos > 0:
        prev, tok = back[pos]
        out.append(tok)
        pos = prev
    return list(reversed(out)), dp[n]

def fertility(text, tokens):
    # Tokens per whitespace-delimited word; higher means heavier splitting.
    words = max(1, len(text.split()))
    return len(tokens) / words
print("Tokenization helpers ready.")
Exercise 1: BPE merge (*)
Learn one frequent-pair merge from a tiny corpus. Compute the answer and explain the LLM consequence.
Code cell 5
# Your Solution - Exercise 1
answer = None
print("Your answer placeholder:", answer)
Code cell 6
# Solution - Exercise 1
header("Exercise 1: BPE merge")
words = {"aa": 3, "ab": 2}
merges, corpus = bpe_train(words, num_merges=1)
print("Merge:", merges[0])
check_true(merges[0] == ("a", "a"), "most frequent pair merges first")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 2: BPE encode (*)
Apply a learned merge table to a new string. Compute the answer and explain the LLM consequence.
Code cell 8
# Your Solution - Exercise 2
answer = None
print("Your answer placeholder:", answer)
Code cell 9
# Solution - Exercise 2
header("Exercise 2: BPE encode")
tokens = bpe_encode("lowest", [("l", "o"), ("lo", "w"), ("e", "s")])
print("Tokens:", tokens)
check_true("low" in tokens, "merge table applies in order")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 3: Compression ratio (*)
Compute characters per token. Compute the answer and explain the LLM consequence.
Code cell 11
# Your Solution - Exercise 3
answer = None
print("Your answer placeholder:", answer)
Code cell 12
# Solution - Exercise 3
header("Exercise 3: Compression ratio")
text = "abcabc"
tokens = ["abc", "abc"]
ratio = len(text) / len(tokens)
print("chars/token:", ratio)
check_close(ratio, 3.0, name="compression ratio")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 4: Token entropy (**)
Measure distribution balance from token counts. Compute the answer and explain the LLM consequence.
Code cell 14
# Your Solution - Exercise 4
answer = None
print("Your answer placeholder:", answer)
Code cell 15
# Solution - Exercise 4
header("Exercise 4: Token entropy")
H = entropy_from_counts({"a": 2, "b": 2, "c": 2, "d": 2})
print("Entropy:", H)
check_close(H, 2.0, name="uniform four-token entropy")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 5: Viterbi segmentation (**)
Find the best unigram token path. Compute the answer and explain the LLM consequence.
Code cell 17
# Your Solution - Exercise 5
answer = None
print("Your answer placeholder:", answer)
Code cell 18
# Solution - Exercise 5
header("Exercise 5: Viterbi segmentation")
seg, score = viterbi_segment("aaaa", {"a": -2.0, "aa": -0.4})
print("Segmentation:", seg, "score:", score)
check_true(seg == ["aa", "aa"], "best path uses likely bigram pieces")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 6: Vocabulary parameter cost (**)
Compute embedding table size. Compute the answer and explain the LLM consequence.
Code cell 20
# Your Solution - Exercise 6
answer = None
print("Your answer placeholder:", answer)
Code cell 21
# Solution - Exercise 6
header("Exercise 6: Vocabulary parameter cost")
params = 50000 * 2048
print("Embedding params:", params)
check_true(params == 102400000, "vocab times width")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 7: Attention cost (**)
Compare quadratic sequence-length costs. Compute the answer and explain the LLM consequence.
Code cell 23
# Your Solution - Exercise 7
answer = None
print("Your answer placeholder:", answer)
Code cell 24
# Solution - Exercise 7
header("Exercise 7: Attention cost")
ratio = (2048 / 1024) ** 2
print("Attention cost ratio:", ratio)
check_close(ratio, 4.0, name="doubling tokens quadruples attention cost")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 8: Fertility (***)
Compare token splits per word. Compute the answer and explain the LLM consequence.
Code cell 26
# Your Solution - Exercise 8
answer = None
print("Your answer placeholder:", answer)
Code cell 27
# Solution - Exercise 8
header("Exercise 8: Fertility")
f1 = fertility("hello world", ["hello", " world"])
f2 = fertility("hello world", ["he", "llo", " world"])
print("Fertilities:", f1, f2)
check_true(f2 > f1, "extra splits increase fertility")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 9: Round trip (***)
Check byte-level reversibility. Compute the answer and explain the LLM consequence.
Code cell 29
# Your Solution - Exercise 9
answer = None
print("Your answer placeholder:", answer)
Code cell 30
# Solution - Exercise 9
header("Exercise 9: Round trip")
text = "\n x=1"
ids = list(text.encode("utf-8"))
decoded = bytes(ids).decode("utf-8")
print("Decoded:", repr(decoded))
check_true(decoded == text, "byte ids preserve whitespace")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 10: Special tokens (***)
Explain why a control delimiter must be atomic. Compute the answer and explain the LLM consequence.
Code cell 32
# Your Solution - Exercise 10
answer = None
print("Your answer placeholder:", answer)
Code cell 33
# Solution - Exercise 10
header("Exercise 10: Special tokens")
ordinary = ["<", "assistant", ">"]
special = ["<assistant>"]
print("Ordinary:", ordinary, "Special:", special)
check_true(len(special) == 1 and len(ordinary) == 3, "special delimiter is atomic")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")