Python Performance & Optimization

Overview

Writing efficient Python code requires understanding performance characteristics, profiling techniques, and optimization strategies. This module covers measuring, analyzing, and improving code performance.

1. Why Performance Matters

When to Optimize

1. Measure first - Don't guess where bottlenecks are
2. Optimize hot paths - Focus on code that runs frequently
3. Consider trade-offs - Readability vs. performance
4. Profile in production-like conditions

Performance Principles

1. Choose the right algorithm (O(n) vs O(n²))
2. Use appropriate data structures
3. Avoid unnecessary work
4. Cache expensive computations
5. Use built-in functions and libraries

2. Time Measurement

Using time module

import time

start = time.time()
# Code to measure
result = expensive_operation()
end = time.time()
print(f"Elapsed: {end - start:.4f} seconds")

# More precise with perf_counter
start = time.perf_counter()
# Code to measure
end = time.perf_counter()
print(f"Elapsed: {end - start:.6f} seconds")

# For CPU time only (excluding sleep)
start = time.process_time()
# Code to measure
end = time.process_time()

Using timeit

import timeit

# Time a simple expression
time_taken = timeit.timeit('"-".join(str(n) for n in range(100))', number=10000)
print(f"Time: {time_taken:.4f}s")

# Time a function
def my_function():
    return sum(range(1000))

time_taken = timeit.timeit(my_function, number=10000)
print(f"Time: {time_taken:.4f}s")

# Compare different approaches
setup = "data = list(range(1000))"
stmt1 = "sum(data)"
stmt2 = "total = 0\nfor x in data: total += x"

print(f"sum(): {timeit.timeit(stmt1, setup, number=10000):.4f}s")
print(f"loop: {timeit.timeit(stmt2, setup, number=10000):.4f}s")

Context Manager for Timing

import time
from contextlib import contextmanager

@contextmanager
def timer(description=""):
    start = time.perf_counter()
    yield
    elapsed = time.perf_counter() - start
    print(f"{description}: {elapsed:.4f}s")

# Usage
with timer("Processing data"):
    result = expensive_operation()

3. Profiling

cProfile - Built-in Profiler

import cProfile
import pstats

# Profile a function
def my_function():
    # Complex code here
    pass

cProfile.run('my_function()')

# Save profile to file
cProfile.run('my_function()', 'profile_output.prof')

# Analyze results
stats = pstats.Stats('profile_output.prof')
stats.strip_dirs()
stats.sort_stats('cumulative')
stats.print_stats(10)  # Top 10 functions

Profile Decorator

import cProfile
import functools

def profile(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        try:
            return profiler.runcall(func, *args, **kwargs)
        finally:
            profiler.print_stats(sort='cumulative')
    return wrapper

@profile
def expensive_function():
    return sum(range(1000000))

line_profiler - Line-by-Line Profiling

pip install line_profiler

# my_script.py
@profile  # Decorator added by kernprof
def slow_function():
    total = 0
    for i in range(1000):
        total += i ** 2
    return total

# Run with: kernprof -l -v my_script.py

memory_profiler - Memory Usage

pip install memory-profiler

from memory_profiler import profile

@profile
def memory_heavy_function():
    big_list = [i ** 2 for i in range(1000000)]
    return sum(big_list)

4. Algorithm Complexity

Big O Notation

O(1)      - Constant: dict lookup, list index
O(log n)  - Logarithmic: binary search
O(n)      - Linear: list iteration
O(n log n)- Linearithmic: good sorting (merge, quick)
O(n²)     - Quadratic: nested loops
O(2^n)    - Exponential: recursive fibonacci

Common Operations Complexity

# Lists
list[i]          # O(1)
list.append(x)   # O(1) amortized
list.insert(0,x) # O(n)
x in list        # O(n)
list.sort()      # O(n log n)

# Dictionaries
dict[key]        # O(1) average
dict[key] = val  # O(1) average
key in dict      # O(1) average

# Sets
x in set         # O(1) average
set.add(x)       # O(1) average
set1 & set2      # O(min(len(s1), len(s2)))

5. Data Structure Optimization

List vs. Set for Membership

# Slow - O(n) for each lookup
def find_duplicates_list(data):
    seen = []
    duplicates = []
    for item in data:
        if item in seen:  # O(n)
            duplicates.append(item)
        seen.append(item)
    return duplicates

# Fast - O(1) for each lookup
def find_duplicates_set(data):
    seen = set()
    duplicates = []
    for item in data:
        if item in seen:  # O(1)
            duplicates.append(item)
        seen.add(item)
    return duplicates

collections.deque for Queues

from collections import deque

# Slow - O(n) for pop(0)
queue = []
queue.append(item)
item = queue.pop(0)  # O(n)

# Fast - O(1) for popleft
queue = deque()
queue.append(item)
item = queue.popleft()  # O(1)

Using slots for Memory

class Point:
    __slots__ = ['x', 'y']  # 40-50% less memory

    def __init__(self, x, y):
        self.x = x
        self.y = y

# vs. regular class (uses dict for attributes)
class PointNoSlots:
    def __init__(self, x, y):
        self.x = x
        self.y = y

6. Caching

functools.lru_cache

from functools import lru_cache

@lru_cache(maxsize=128)
def fibonacci(n):
    if n < 2:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)

# Check cache stats
print(fibonacci.cache_info())
# CacheInfo(hits=46, misses=51, maxsize=128, currsize=51)

# Clear cache
fibonacci.cache_clear()

functools.cache (Python 3.9+)

from functools import cache

@cache  # Unlimited cache
def expensive_computation(x):
    return x ** 100

Custom Caching

def memoize(func):
    cache = {}

    def wrapper(*args):
        if args not in cache:
            cache[args] = func(*args)
        return cache[args]

    wrapper.cache = cache
    wrapper.clear_cache = lambda: cache.clear()
    return wrapper

@memoize
def slow_function(x):
    time.sleep(1)
    return x * 2

Cache with TTL

import time
from functools import wraps

def timed_cache(seconds):
    def decorator(func):
        cache = {}

        @wraps(func)
        def wrapper(*args):
            now = time.time()
            if args in cache:
                result, timestamp = cache[args]
                if now - timestamp < seconds:
                    return result

            result = func(*args)
            cache[args] = (result, now)
            return result

        return wrapper
    return decorator

@timed_cache(60)  # Cache for 60 seconds
def fetch_data(url):
    return requests.get(url).json()

7. String Optimization

String Concatenation

# Slow - creates new string each iteration
result = ""
for s in strings:
    result += s  # O(n²) total

# Fast - join is optimized
result = "".join(strings)  # O(n) total

# For formatted strings, use f-strings
name = "John"
age = 30
s = f"{name} is {age}"  # Fastest
s = "{} is {}".format(name, age)  # Slower
s = "%s is %d" % (name, age)  # Slowest

String Interning

import sys

# Python automatically interns small strings
a = "hello"
b = "hello"
print(a is b)  # True (same object)

# Force interning for larger strings
a = sys.intern("a longer string that won't be auto-interned")
b = sys.intern("a longer string that won't be auto-interned")
print(a is b)  # True

8. Loop Optimization

List Comprehensions vs. Loops

# Slower
result = []
for x in range(1000):
    result.append(x ** 2)

# Faster - list comprehension
result = [x ** 2 for x in range(1000)]

# Even faster for simple operations - map
result = list(map(lambda x: x ** 2, range(1000)))

Generator Expressions for Memory

# Uses memory for all items
total = sum([x ** 2 for x in range(1000000)])

# Uses constant memory (generator)
total = sum(x ** 2 for x in range(1000000))

Avoid Function Calls in Loops

# Slow - method lookup each iteration
for item in items:
    result.append(item)

# Faster - local reference
append = result.append
for item in items:
    append(item)

Use Built-in Functions

# Slow
total = 0
for x in data:
    total += x

# Fast - C implementation
total = sum(data)

# Same for other built-ins
maximum = max(data)
minimum = min(data)
any_true = any(x > 0 for x in data)
all_true = all(x > 0 for x in data)

9. NumPy for Numerical Performance

Vectorized Operations

import numpy as np

# Slow - Python loop
data = list(range(1000000))
result = [x * 2 for x in data]

# Fast - NumPy vectorization
data = np.arange(1000000)
result = data * 2  # 10-100x faster

Avoiding Copies

import numpy as np

# Creates copy
arr = np.array([1, 2, 3, 4, 5])
view = arr[::2].copy()

# Creates view (shares memory)
view = arr[::2]  # Modifications affect original

# Use in-place operations
arr *= 2  # In-place
arr = arr * 2  # Creates copy

10. Concurrency for I/O-Bound Tasks

asyncio for I/O

import asyncio
import aiohttp

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# Fetch 100 URLs concurrently
urls = [f"http://example.com/page/{i}" for i in range(100)]
results = asyncio.run(fetch_all(urls))

Threading for I/O

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch(url):
    return requests.get(url).text

urls = [f"http://example.com/page/{i}" for i in range(100)]

with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(fetch, urls))

Multiprocessing for CPU-Bound

from multiprocessing import Pool

def cpu_intensive(n):
    return sum(i ** 2 for i in range(n))

numbers = [1000000] * 8

with Pool(processes=4) as pool:
    results = pool.map(cpu_intensive, numbers)

11. Memory Optimization

Generators Instead of Lists

# Memory heavy - stores all items
def get_all_data():
    return [process(i) for i in range(1000000)]

# Memory efficient - yields one at a time
def get_all_data_generator():
    for i in range(1000000):
        yield process(i)

Delete Unused Objects

# Free memory explicitly
big_data = load_huge_file()
process(big_data)
del big_data

# Or use context managers
with open("huge_file.txt") as f:
    process(f.read())
# File closed, memory can be freed

Check Memory Usage

import sys

# Size of object
x = [1, 2, 3, 4, 5]
print(sys.getsizeof(x))  # 120 bytes

# Deep size (including referenced objects)
def deep_getsizeof(obj, seen=None):
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    seen.add(obj_id)

    if isinstance(obj, dict):
        size += sum([deep_getsizeof(v, seen) for v in obj.values()])
        size += sum([deep_getsizeof(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes)):
        size += sum([deep_getsizeof(i, seen) for i in obj])

    return size

12. Best Practices

1. Profile Before Optimizing

# Don't guess - measure!
import cProfile
cProfile.run('main()', sort='cumulative')

2. Use Appropriate Data Structures

# For membership testing: set > list
# For ordered unique items: dict (Python 3.7+)
# For counting: Counter
# For FIFO queue: deque

3. Avoid Premature Optimization

# First: Make it work
# Second: Make it right (clean, tested)
# Third: Make it fast (if needed)

4. Use Built-in Functions

# Built-ins are implemented in C
sum(), max(), min(), sorted(), any(), all()

5. Consider Using Cython or PyPy

# For CPU-bound code that can't use numpy
# Cython: Write Python-like code, compile to C
# PyPy: Drop-in Python replacement with JIT

Summary

Technique	When to Use
timeit	Quick timing comparisons
cProfile	Find slow functions
line_profiler	Line-by-line analysis
lru_cache	Memoize pure functions
generators	Large data sequences
numpy	Numerical computations
asyncio	I/O-bound tasks
multiprocessing	CPU-bound tasks

Next Steps

After mastering performance:

•Learn NumPy for numerical computing
•Explore Cython for C-like performance
•Study distributed computing with Dask
•Learn about PyPy for JIT compilation

README