Theory Notebook
Converted from
theory.ipynb for web reading.
Positional Encodings
This notebook is the executable companion to notes.md. It checks sinusoidal rows, learned table sizes, relative offsets, RoPE rotations, ALiBi bias, and decode position ids.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Prefer seaborn's theme when it is installed; otherwise fall back to the
# matching built-in matplotlib style so the notebook still runs without it.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # NOTE(review): the "seaborn-v0_8-*" style names were introduced in
    # matplotlib 3.6 -- confirm the minimum matplotlib version this targets.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults applied to every plot created after this cell.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Seed NumPy's global RNG so any random draws are reproducible across runs.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
# Named hex colours looked up by role in the plotting cells
# (e.g. COLORS["primary"] / COLORS["secondary"] for line colours).
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
def header(title):
    """Print ``title`` framed by two 72-character ``=`` rules.

    A blank line is emitted before the top rule so consecutive demo
    sections are visually separated in the output.
    """
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(condition, name):
    """Report a boolean check and fail if it does not hold.

    Prints ``PASS - <name>`` or ``FAIL - <name>`` depending on the
    truthiness of ``condition``.

    Raises:
        AssertionError: with ``name`` as its message when ``condition``
            is falsy.
    """
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    # Raise explicitly instead of using `assert`, which is stripped when
    # Python runs with -O and would let a failed check pass silently.
    if not ok:
        raise AssertionError(name)
def check_close(value, target, tol=1e-8, name="value"):
    """Report whether ``value`` is within ``tol`` of ``target`` and fail if not.

    Both inputs are converted with ``float`` and compared by absolute
    difference (``abs(value - target) <= tol``). A PASS/FAIL line showing
    both numbers to six decimals is printed either way.

    Raises:
        AssertionError: with ``name`` as its message when the difference
            exceeds ``tol`` (a NaN input also fails the comparison).
    """
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    # Raise explicitly instead of using `assert`, which is stripped when
    # Python runs with -O and would let a failed check pass silently.
    if not ok:
        raise AssertionError(name)
def sinusoidal_positions(n, d):
    """Return an ``(n, d)`` table of sinusoidal positional encodings.

    Columns are grouped in pairs sharing one frequency: for pair index
    ``m`` the even column ``2m`` holds ``sin(pos / 10000**(2m / d))`` and
    the odd column ``2m + 1`` holds the cosine of the same angle.

    Args:
        n: number of positions (rows), starting at position 0.
        d: number of feature columns.
    """
    pos = np.arange(n)[:, None]
    i = np.arange(d)[None, :]
    # i // 2 gives both columns of a sin/cos pair the same exponent.
    rates = 1 / np.power(10000, (2 * (i // 2)) / d)
    angles = pos * rates  # broadcasts (n, 1) * (1, d) -> (n, d)
    pe = np.zeros((n, d))
    pe[:, 0::2] = np.sin(angles[:, 0::2])
    pe[:, 1::2] = np.cos(angles[:, 1::2])
    return pe
def relative_offsets(T):
    """Return the ``(T, T)`` integer matrix whose entry ``[i, j]`` is ``i - j``.

    Rows index ``i`` and columns index ``j``, so entries below the
    diagonal are positive and entries above it are negative.
    """
    i = np.arange(T)[:, None]
    j = np.arange(T)[None, :]
    return i - j  # broadcasts (T, 1) - (1, T) -> (T, T)
def rope_rotate(x, position, base=10000.0):
    """Apply a rotary (RoPE) rotation to a 1-D vector of even length.

    Consecutive coordinates ``(x[k], x[k + 1])`` for ``k = 0, 2, 4, ...``
    are treated as 2-D points and rotated by the angle
    ``position / base**(k / d)``, where ``d = len(x)``. The input is not
    modified; a new float array is returned.

    Args:
        x: sequence or array of even length.
        position: position used to scale every pair's angle.
        base: frequency base; larger values slow the later pairs further.
    """
    x = np.asarray(x, dtype=float)
    assert len(x) % 2 == 0, "rope_rotate needs an even number of components"
    out = x.copy()
    d = len(x)
    for k in range(0, d, 2):
        theta = position / (base ** (k / d))
        c, s = np.cos(theta), np.sin(theta)
        # Read from x (not out) so each pair uses its unrotated values.
        a, b = x[k], x[k + 1]
        out[k] = c * a - s * b
        out[k + 1] = s * a + c * b
    return out
def alibi_bias(T, slope=-0.25):
    """Return a ``(T, T)`` linear bias: ``slope * max(i - j, 0)``.

    Entries with ``i > j`` get ``slope`` times the offset ``i - j``;
    entries on or above the diagonal have the offset clamped to zero and
    therefore receive no penalty.

    NOTE: with a negative ``slope`` the clamped entries are IEEE negative
    zeros, so they print as ``-0.`` even though they compare equal to 0.
    """
    return slope * np.maximum(relative_offsets(T), 0)


print("Positional-encoding helpers ready.")
Demo 1: Why attention needs position
This demo turns one position-encoding idea into a checked numeric example.
Code cell 5
header("Demo 1: Why attention needs position - sinusoidal table")

# Small table: 6 positions by 8 feature columns.
pe = sinusoidal_positions(6, 8)
print("Shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin position zero")

# Plot the first sine/cosine column pair against position.
fig, ax = plt.subplots()
for dim, color_key, label in ((0, "primary", "dim 0"), (1, "secondary", "dim 1")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=label)
ax.set_title("Sinusoidal positional features")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 2: Absolute versus relative position
This demo turns one position-encoding idea into a checked numeric example.
Code cell 7
header("Demo 2: Absolute versus relative position - token plus position")

# Toy vector of width 4 and the width-4 encoding for position 0.
x = np.array([1.0, 2.0, 3.0, 4.0])
p = sinusoidal_positions(1, 4)[0]
h = x + p
for label, vec in (("Position vector:", p), ("Hidden init:", h)):
    print(label, np.round(vec, 3).tolist())
same_width = h.shape == x.shape
check_true(same_width, "addition preserves model width")
Demo 3: Additive versus score-based position
This demo turns one position-encoding idea into a checked numeric example.
Code cell 9
header("Demo 3: Additive versus score-based position - relative offsets")

R = relative_offsets(4)
print(R)
# Opposite corners carry the largest positive and negative offsets.
lower_left_ok = R[3, 0] == 3
upper_right_ok = R[0, 3] == -3
check_true(lower_left_ok and upper_right_ok, "offset matrix is query minus key")
Demo 4: Length extrapolation
This demo turns one position-encoding idea into a checked numeric example.
Code cell 11
header("Demo 4: Length extrapolation - relative bias")

# Symmetric penalty: -0.1 per step of absolute distance.
distance = np.abs(relative_offsets(5))
bias = -0.1 * distance
print(np.round(bias, 2))
check_close(bias[0, 4], -0.4, name="distance four penalty")
Demo 5: Position in decoder-only LLMs
This demo turns one position-encoding idea into a checked numeric example.
Code cell 13
header("Demo 5: Position in decoder-only LLMs - RoPE norm")

x = np.array([1.0, 0.0, 0.5, -0.5])
y = rope_rotate(x, position=11)
print("Rotated:", np.round(y, 4).tolist())
# Compare vector length after and before the rotation.
norm_after = np.linalg.norm(y)
norm_before = np.linalg.norm(x)
check_close(norm_after, norm_before, name="RoPE preserves norm")
Demo 6: Position indices
This demo turns one position-encoding idea into a checked numeric example.
Code cell 15
header("Demo 6: Position indices - RoPE relative dot product")

q = np.array([1.0, 0.0])
k = np.array([0.5, 0.5])
i_pos, j_pos = 7, 4

# Score with each vector rotated to its own position.
q_rot = rope_rotate(q, i_pos)
k_rot = rope_rotate(k, j_pos)
left = np.dot(q_rot, k_rot)

# Same score using only the offset: rotate the key by (j - i), leave q as is.
k_shifted = rope_rotate(k, j_pos - i_pos)
right = np.dot(q, k_shifted)

print("Left:", round(float(left), 6), "Right:", round(float(right), 6))
check_close(left, right, tol=1e-8, name="relative rotation identity")
Demo 7: Token plus position representation
This demo turns one position-encoding idea into a checked numeric example.
Code cell 17
header("Demo 7: Token plus position representation - ALiBi")

bias = alibi_bias(T=5, slope=-0.25)
rounded_bias = np.round(bias, 2)
print(rounded_bias)
# Offset 4 at slope -0.25 should give a penalty of exactly -1.
check_close(bias[4, 0], -1.0, name="linear past-distance penalty")
Demo 8: Attention score modification
This demo turns one position-encoding idea into a checked numeric example.
Code cell 19
header("Demo 8: Attention score modification - learned table params")

# One learned vector of width d per position, for Tmax positions.
Tmax = 4096
d = 4096
params = Tmax * d
print("Position parameters:", params)
check_true(params == 16_777_216, "learned absolute table size")
Demo 9: Relative offset notation
This demo turns one position-encoding idea into a checked numeric example.
Code cell 21
header("Demo 9: Relative offset notation - decode position ids")

prefix = 12
generated = np.arange(5)
# Generated tokens continue counting from the end of the prefix.
pos_ids = prefix + generated
print("Decode position ids:", pos_ids.tolist())
starts_at_prefix = pos_ids[0] == prefix
unit_steps = np.all(np.diff(pos_ids) == 1)
check_true(starts_at_prefix and unit_steps, "decode positions advance by one")
Demo 10: Position interpolation and scaling
This demo turns one position-encoding idea into a checked numeric example.
Code cell 23
header("Demo 10: Position interpolation and scaling - length scaling")

train_len = 2048
target_len = 8192
# Factor that compresses target-length positions toward the trained range.
scale = train_len / target_len
orig_pos = np.array([0, 2048, 4096, 8191])
scaled_pos = orig_pos * scale
print("Scaled positions:", np.round(scaled_pos, 2).tolist())
last_scaled = scaled_pos[-1]
check_true(last_scaled < train_len, "interpolation maps target positions into training range")
Demo 11: Frequency ladder
This demo turns one position-encoding idea into a checked numeric example.
Code cell 25
header("Demo 11: Frequency ladder - sinusoidal table")

# Small table: 6 positions by 8 feature columns.
pe = sinusoidal_positions(6, 8)
print("Shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin position zero")

# Plot the first sine/cosine column pair against position.
fig, ax = plt.subplots()
for dim, color_key, label in ((0, "primary", "dim 0"), (1, "secondary", "dim 1")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=label)
ax.set_title("Sinusoidal positional features")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 12: Sine cosine pairs
This demo turns one position-encoding idea into a checked numeric example.
Code cell 27
header("Demo 12: Sine cosine pairs - token plus position")

# Toy vector of width 4 and the width-4 encoding for position 0.
x = np.array([1.0, 2.0, 3.0, 4.0])
p = sinusoidal_positions(1, 4)[0]
h = x + p
for label, vec in (("Position vector:", p), ("Hidden init:", h)):
    print(label, np.round(vec, 3).tolist())
same_width = h.shape == x.shape
check_true(same_width, "addition preserves model width")
Demo 13: Linear relative-offset intuition
This demo turns one position-encoding idea into a checked numeric example.
Code cell 29
header("Demo 13: Linear relative-offset intuition - relative offsets")

R = relative_offsets(4)
print(R)
# Opposite corners carry the largest positive and negative offsets.
lower_left_ok = R[3, 0] == 3
upper_right_ok = R[0, 3] == -3
check_true(lower_left_ok and upper_right_ok, "offset matrix is query minus key")
Demo 14: Visualization and aliasing
This demo turns one position-encoding idea into a checked numeric example.
Code cell 31
header("Demo 14: Visualization and aliasing - relative bias")

# Symmetric penalty: -0.1 per step of absolute distance.
distance = np.abs(relative_offsets(5))
bias = -0.1 * distance
print(np.round(bias, 2))
check_close(bias[0, 4], -0.4, name="distance four penalty")
Demo 15: Limitations
This demo turns one position-encoding idea into a checked numeric example.
Code cell 33
header("Demo 15: Limitations - RoPE norm")

x = np.array([1.0, 0.0, 0.5, -0.5])
y = rope_rotate(x, position=11)
print("Rotated:", np.round(y, 4).tolist())
# Compare vector length after and before the rotation.
norm_after = np.linalg.norm(y)
norm_before = np.linalg.norm(x)
check_close(norm_after, norm_before, name="RoPE preserves norm")
Demo 16: Learned position table
This demo turns one position-encoding idea into a checked numeric example.
Code cell 35
header("Demo 16: Learned position table - RoPE relative dot product")

q = np.array([1.0, 0.0])
k = np.array([0.5, 0.5])
i_pos, j_pos = 7, 4

# Score with each vector rotated to its own position.
q_rot = rope_rotate(q, i_pos)
k_rot = rope_rotate(k, j_pos)
left = np.dot(q_rot, k_rot)

# Same score using only the offset: rotate the key by (j - i), leave q as is.
k_shifted = rope_rotate(k, j_pos - i_pos)
right = np.dot(q, k_shifted)

print("Left:", round(float(left), 6), "Right:", round(float(right), 6))
check_close(left, right, tol=1e-8, name="relative rotation identity")
Demo 17: Training length limit
This demo turns one position-encoding idea into a checked numeric example.
Code cell 37
header("Demo 17: Training length limit - ALiBi")

bias = alibi_bias(T=5, slope=-0.25)
rounded_bias = np.round(bias, 2)
print(rounded_bias)
# Offset 4 at slope -0.25 should give a penalty of exactly -1.
check_close(bias[4, 0], -1.0, name="linear past-distance penalty")
Demo 18: Interpolation resizing
This demo turns one position-encoding idea into a checked numeric example.
Code cell 39
header("Demo 18: Interpolation resizing - learned table params")

# One learned vector of width d per position, for Tmax positions.
Tmax = 4096
d = 4096
params = Tmax * d
print("Position parameters:", params)
check_true(params == 16_777_216, "learned absolute table size")
Demo 19: BERT GPT-style usage
This demo turns one position-encoding idea into a checked numeric example.
Code cell 41
header("Demo 19: BERT GPT-style usage - decode position ids")

prefix = 12
generated = np.arange(5)
# Generated tokens continue counting from the end of the prefix.
pos_ids = prefix + generated
print("Decode position ids:", pos_ids.tolist())
starts_at_prefix = pos_ids[0] == prefix
unit_steps = np.all(np.diff(pos_ids) == 1)
check_true(starts_at_prefix and unit_steps, "decode positions advance by one")
Demo 20: Failure modes
This demo turns one position-encoding idea into a checked numeric example.
Code cell 43
header("Demo 20: Failure modes - length scaling")

train_len = 2048
target_len = 8192
# Factor that compresses target-length positions toward the trained range.
scale = train_len / target_len
orig_pos = np.array([0, 2048, 4096, 8191])
scaled_pos = orig_pos * scale
print("Scaled positions:", np.round(scaled_pos, 2).tolist())
last_scaled = scaled_pos[-1]
check_true(last_scaled < train_len, "interpolation maps target positions into training range")
Demo 21: Relative bias matrices
This demo turns one position-encoding idea into a checked numeric example.
Code cell 45
header("Demo 21: Relative bias matrices - sinusoidal table")

# Small table: 6 positions by 8 feature columns.
pe = sinusoidal_positions(6, 8)
print("Shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin position zero")

# Plot the first sine/cosine column pair against position.
fig, ax = plt.subplots()
for dim, color_key, label in ((0, "primary", "dim 0"), (1, "secondary", "dim 1")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=label)
ax.set_title("Sinusoidal positional features")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 22: Shaw-style relative keys
This demo turns one position-encoding idea into a checked numeric example.
Code cell 47
header("Demo 22: Shaw-style relative keys - token plus position")

# Toy vector of width 4 and the width-4 encoding for position 0.
x = np.array([1.0, 2.0, 3.0, 4.0])
p = sinusoidal_positions(1, 4)[0]
h = x + p
for label, vec in (("Position vector:", p), ("Hidden init:", h)):
    print(label, np.round(vec, 3).tolist())
same_width = h.shape == x.shape
check_true(same_width, "addition preserves model width")
Demo 23: Transformer-XL intuition
This demo turns one position-encoding idea into a checked numeric example.
Code cell 49
header("Demo 23: Transformer-XL intuition - relative offsets")

R = relative_offsets(4)
print(R)
# Opposite corners carry the largest positive and negative offsets.
lower_left_ok = R[3, 0] == 3
upper_right_ok = R[0, 3] == -3
check_true(lower_left_ok and upper_right_ok, "offset matrix is query minus key")
Demo 24: Bucketed distances
This demo turns one position-encoding idea into a checked numeric example.
Code cell 51
header("Demo 24: Bucketed distances - relative bias")

# Symmetric penalty: -0.1 per step of absolute distance.
distance = np.abs(relative_offsets(5))
bias = -0.1 * distance
print(np.round(bias, 2))
check_close(bias[0, 4], -0.4, name="distance four penalty")
Demo 25: When relative position helps
This demo turns one position-encoding idea into a checked numeric example.
Code cell 53
header("Demo 25: When relative position helps - RoPE norm")

x = np.array([1.0, 0.0, 0.5, -0.5])
y = rope_rotate(x, position=11)
print("Rotated:", np.round(y, 4).tolist())
# Compare vector length after and before the rotation.
norm_after = np.linalg.norm(y)
norm_before = np.linalg.norm(x)
check_close(norm_after, norm_before, name="RoPE preserves norm")