Exercises Notebook
Converted from exercises.ipynb for web reading.
Exercises: Tokenization Math
There are 10 exercises. Exercises 1-3 cover BPE mechanics, 4-7 cover information and cost, and 8-10 cover system diagnostics.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # Fall back to matplotlib's bundled seaborn style if seaborn is absent.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    # Banner separating one exercise's output from the next.
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    # Assert a boolean check, printing PASS/FAIL first.
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    # Assert a numeric check within an absolute tolerance.
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def char_tokens(text):
    # Character-level baseline tokenization.
    return tuple(text)

def pair_counts(corpus):
    # Count adjacent symbol pairs across a {word: frequency} corpus.
    counts = {}
    for word, freq in corpus.items():
        pieces = list(word)
        for a, b in zip(pieces, pieces[1:]):
            counts[(a, b)] = counts.get((a, b), 0) + freq
    return counts

def merge_word(word, pair):
    # Replace each non-overlapping occurrence of `pair` with its concatenation.
    out = []
    i = 0
    while i < len(word):
        if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
            out.append(word[i] + word[i + 1])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return tuple(out)

def bpe_train(words, num_merges=3):
    # Greedy BPE training: repeatedly merge the most frequent adjacent pair.
    corpus = {tuple(w): f for w, f in words.items()}
    merges = []
    for _ in range(num_merges):
        counts = {}
        for pieces, freq in corpus.items():
            for a, b in zip(pieces, pieces[1:]):
                counts[(a, b)] = counts.get((a, b), 0) + freq
        if not counts:
            break
        pair = max(counts, key=counts.get)
        merges.append(pair)
        corpus = {merge_word(pieces, pair): freq for pieces, freq in corpus.items()}
    return merges, corpus

def bpe_encode(text, merges):
    # Apply learned merges in training order to segment new text.
    pieces = tuple(text)
    for pair in merges:
        pieces = merge_word(pieces, pair)
    return list(pieces)

def entropy_from_counts(counts):
    # Shannon entropy (bits) of the empirical token distribution.
    vals = np.array(list(counts.values()), dtype=float)
    vals = vals[vals > 0]  # zero counts would make log2 produce NaN
    probs = vals / vals.sum()
    return float(-(probs * np.log2(probs)).sum())

def viterbi_segment(text, token_logprobs):
    # Dynamic program for the highest-log-prob segmentation under a unigram model.
    n = len(text)
    dp = [-1e18] * (n + 1)
    back = [None] * (n + 1)
    dp[0] = 0.0
    for i in range(n):
        if dp[i] < -1e17:
            continue  # position not reachable by any token path
        for tok, logp in token_logprobs.items():
            if text.startswith(tok, i):
                j = i + len(tok)
                score = dp[i] + logp
                if score > dp[j]:
                    dp[j] = score
                    back[j] = (i, tok)
    if back[n] is None:
        return [], -np.inf
    out = []
    pos = n
    while pos > 0:
        prev, tok = back[pos]
        out.append(tok)
        pos = prev
    return list(reversed(out)), dp[n]

def fertility(text, tokens):
    # Tokens per whitespace-delimited word; higher means heavier splitting.
    words = max(1, len(text.split()))
    return len(tokens) / words
print("Tokenization helpers ready.")
Exercise 1: BPE merge (*)
Learn one frequent-pair merge from a tiny corpus. Compute the answer and explain the LLM consequence.
Code cell 5
# Your Solution - Exercise 1
answer = None
print("Your answer placeholder:", answer)
Code cell 6
# Solution - Exercise 1
header("Exercise 1: BPE merge")
words = {"aa": 3, "ab": 2}
merges, corpus = bpe_train(words, num_merges=1)
print("Merge:", merges[0])
check_true(merges[0] == ("a", "a"), "most frequent pair merges first")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 2: BPE encode (*)
Apply a learned merge table to a new string. Compute the answer and explain the LLM consequence.
Code cell 8
# Your Solution - Exercise 2
answer = None
print("Your answer placeholder:", answer)
Code cell 9
# Solution - Exercise 2
header("Exercise 2: BPE encode")
tokens = bpe_encode("lowest", [("l", "o"), ("lo", "w"), ("e", "s")])
print("Tokens:", tokens)
check_true("low" in tokens, "merge table applies in order")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 3: Compression ratio (*)
Compute characters per token. Compute the answer and explain the LLM consequence.
Code cell 11
# Your Solution - Exercise 3
answer = None
print("Your answer placeholder:", answer)
Code cell 12
# Solution - Exercise 3
header("Exercise 3: Compression ratio")
text = "abcabc"
tokens = ["abc", "abc"]
ratio = len(text) / len(tokens)
print("chars/token:", ratio)
check_close(ratio, 3.0, name="compression ratio")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 4: Token entropy (**)
Measure distribution balance from token counts. Compute the answer and explain the LLM consequence.
Code cell 14
# Your Solution - Exercise 4
answer = None
print("Your answer placeholder:", answer)
Code cell 15
# Solution - Exercise 4
header("Exercise 4: Token entropy")
H = entropy_from_counts({"a": 2, "b": 2, "c": 2, "d": 2})
print("Entropy:", H)
check_close(H, 2.0, name="uniform four-token entropy")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 5: Viterbi segmentation (**)
Find the best unigram token path. Compute the answer and explain the LLM consequence.
Code cell 17
# Your Solution - Exercise 5
answer = None
print("Your answer placeholder:", answer)
Code cell 18
# Solution - Exercise 5
header("Exercise 5: Viterbi segmentation")
seg, score = viterbi_segment("aaaa", {"a": -2.0, "aa": -0.4})
print("Segmentation:", seg, "score:", score)
check_true(seg == ["aa", "aa"], "best path uses likely bigram pieces")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 6: Vocabulary parameter cost (**)
Compute embedding table size. Compute the answer and explain the LLM consequence.
Code cell 20
# Your Solution - Exercise 6
answer = None
print("Your answer placeholder:", answer)
Code cell 21
# Solution - Exercise 6
header("Exercise 6: Vocabulary parameter cost")
params = 50000 * 2048
print("Embedding params:", params)
check_true(params == 102400000, "vocab times width")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 7: Attention cost (**)
Compare quadratic sequence-length costs. Compute the answer and explain the LLM consequence.
Code cell 23
# Your Solution - Exercise 7
answer = None
print("Your answer placeholder:", answer)
Code cell 24
# Solution - Exercise 7
header("Exercise 7: Attention cost")
ratio = (2048 / 1024) ** 2
print("Attention cost ratio:", ratio)
check_close(ratio, 4.0, name="doubling tokens quadruples attention cost")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 8: Fertility (***)
Compare token splits per word. Compute the answer and explain the LLM consequence.
Code cell 26
# Your Solution - Exercise 8
answer = None
print("Your answer placeholder:", answer)
Code cell 27
# Solution - Exercise 8
header("Exercise 8: Fertility")
f1 = fertility("hello world", ["hello", " world"])
f2 = fertility("hello world", ["he", "llo", " world"])
print("Fertilities:", f1, f2)
check_true(f2 > f1, "extra splits increase fertility")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 9: Round trip (***)
Check byte-level reversibility. Compute the answer and explain the LLM consequence.
Code cell 29
# Your Solution - Exercise 9
answer = None
print("Your answer placeholder:", answer)
Code cell 30
# Solution - Exercise 9
header("Exercise 9: Round trip")
text = "\n x=1"
ids = list(text.encode("utf-8"))
decoded = bytes(ids).decode("utf-8")
print("Decoded:", repr(decoded))
check_true(decoded == text, "byte ids preserve whitespace")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")
Exercise 10: Special tokens (***)
Explain why a control delimiter must be atomic. Compute the answer and explain the LLM consequence.
Code cell 32
# Your Solution - Exercise 10
answer = None
print("Your answer placeholder:", answer)
Code cell 33
# Solution - Exercise 10
header("Exercise 10: Special tokens")
ordinary = ["<", "assistant", ">"]
special = ["<assistant>"]
print("Ordinary:", ordinary, "Special:", special)
check_true(len(special) == 1 and len(ordinary) == 3, "special delimiter is atomic")
print("\nTakeaway: tokenization decisions change ids, lengths, costs, and model behavior before attention ever runs.")