Docs

README

13 - Regular Expressions (Regex)

📌 What You'll Learn

  • Pattern matching basics
  • Metacharacters and special sequences
  • Character classes
  • Quantifiers
  • Groups and capturing
  • Common patterns
  • re module functions

🔍 What is Regex?

Regular Expressions are patterns used to match character combinations in strings. Python's re module provides regex support.

import re

# Basic match
text = "Hello, World!"
if re.search(r"World", text):
    print("Found!")

📝 Basic Functions

re.search() - Find Pattern

import re

text = "The rain in Spain"

# Returns Match object or None
match = re.search(r"rain", text)
if match:
    print(f"Found: {match.group()}")
    print(f"Position: {match.start()}-{match.end()}")

re.match() - Match at Start

# Only matches at the beginning
match = re.match(r"The", text)     # Match!
match = re.match(r"rain", text)    # None (not at start)

re.findall() - Find All Matches

text = "cat bat rat sat"
matches = re.findall(r".at", text)
print(matches)  # ['cat', 'bat', 'rat', 'sat']

re.finditer() - Iterator of Matches

for match in re.finditer(r".at", text):
    print(f"{match.group()} at {match.start()}")

re.sub() - Replace Pattern

text = "Hello, World!"
result = re.sub(r"World", "Python", text)
print(result)  # Hello, Python!

# Replace multiple
text = "one two three"
result = re.sub(r"\s", "-", text)
print(result)  # one-two-three

re.split() - Split by Pattern

text = "one, two; three four"
parts = re.split(r"[,;\s]+", text)
print(parts)  # ['one', 'two', 'three', 'four']

🔤 Metacharacters

| Character | Meaning                        |
| --------- | ------------------------------ |
| .         | Any character (except newline) |
| ^         | Start of string                |
| $         | End of string                  |
| *         | 0 or more repetitions          |
| +         | 1 or more repetitions          |
| ?         | 0 or 1 repetition              |
| \         | Escape special character       |
| \|        | OR (alternation)               |
| ()        | Group                          |
| []        | Character class                |
| {}        | Specific repetitions           |

import re

# . matches any character
re.findall(r"c.t", "cat cot cut")  # ['cat', 'cot', 'cut']

# ^ and $ anchors
re.search(r"^Hello", "Hello World")  # Matches
re.search(r"World$", "Hello World")  # Matches

# | alternation
re.findall(r"cat|dog", "I have a cat and a dog")  # ['cat', 'dog']

📊 Character Classes

| Class       | Meaning        |
| ----------- | -------------- |
| [abc]       | a, b, or c     |
| [^abc]      | NOT a, b, or c |
| [a-z]       | a through z    |
| [A-Z]       | A through Z    |
| [0-9]       | 0 through 9    |
| [a-zA-Z0-9] | Alphanumeric   |

# Character class examples
re.findall(r"[aeiou]", "hello")  # ['e', 'o']
re.findall(r"[^aeiou]", "hello") # ['h', 'l', 'l']
re.findall(r"[0-9]+", "abc123xyz456")  # ['123', '456']

🔢 Special Sequences

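| Sequence | Meaning                  |
| -------- | ------------------------ |
| \d       | Digit [0-9]              |
| \D       | Non-digit                |
| \w       | Word char [a-zA-Z0-9_]   |
| \W       | Non-word char            |
| \s       | Whitespace               |
| \S       | Non-whitespace           |
| \b       | Word boundary            |
| \B       | Non-word boundary        |
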
text = "My phone is 123-456-7890"

re.findall(r"\d+", text)  # ['123', '456', '7890']
re.findall(r"\w+", text)  # ['My', 'phone', 'is', '123', '456', '7890']

# Word boundaries
re.findall(r"\bcat\b", "cat category scat")  # ['cat']

🔁 Quantifiers

| Quantifier | Meaning         |
| ---------- | --------------- |
| *          | 0 or more       |
| +          | 1 or more       |
| ?          | 0 or 1          |
| {n}        | Exactly n       |
| {n,}       | n or more       |
| {n,m}      | Between n and m |

text = "abbb ac abc abbc"

re.findall(r"ab*", text)    # ['abbb', 'a', 'ab', 'abb']
re.findall(r"ab+", text)    # ['abbb', 'ab', 'abb']
re.findall(r"ab?", text)    # ['ab', 'a', 'ab', 'ab']
re.findall(r"ab{2}", text)  # ['abb', 'abb']
# {1,2} is greedy: "abbb" yields 'abb' (two b's), "ac" has no b so it is skipped
re.findall(r"ab{1,2}", text)  # ['abb', 'ab', 'abb']

Greedy vs Non-Greedy

text = "<div>content</div>"

# Greedy (default) - matches as much as possible
re.findall(r"<.*>", text)   # ['<div>content</div>']

# Non-greedy (?) - matches as little as possible
re.findall(r"<.*?>", text)  # ['<div>', '</div>']

👥 Groups and Capturing

Basic Groups

text = "John Smith, Jane Doe"

# Parentheses create groups
matches = re.findall(r"(\w+) (\w+)", text)
print(matches)  # [('John', 'Smith'), ('Jane', 'Doe')]

# Access groups
match = re.search(r"(\w+) (\w+)", text)
print(match.group(0))  # John Smith (full match)
print(match.group(1))  # John
print(match.group(2))  # Smith
print(match.groups())  # ('John', 'Smith')

Named Groups

pattern = r"(?P<first>\w+) (?P<last>\w+)"
match = re.search(pattern, "John Smith")

print(match.group('first'))  # John
print(match.group('last'))   # Smith
print(match.groupdict())     # {'first': 'John', 'last': 'Smith'}

Non-Capturing Groups

# (?:...) groups but doesn't capture
pattern = r"(?:Mr|Mrs|Ms)\. (\w+)"
match = re.search(pattern, "Mr. Smith")
print(match.group(1))  # Smith (not Mr)

Backreferences

# \1, \2, etc. refer to captured groups
pattern = r"(\w+) \1"  # Repeated word
text = "the the cat sat sat"
matches = re.findall(pattern, text)
print(matches)  # ['the', 'sat']

🚩 Flags

import re

text = "Hello\nworld"

# IGNORECASE (re.I)
re.findall(r"hello", "Hello HELLO", re.IGNORECASE)

# MULTILINE (re.M) - ^ and $ match line start/end
re.findall(r"^\w+", text, re.MULTILINE)  # ['Hello', 'world']

# DOTALL (re.S) - . matches newline too
re.findall(r"Hello.world", text, re.DOTALL)  # ['Hello\nworld']

# VERBOSE (re.X) - allows comments in pattern
pattern = re.compile(r"""
    \d{3}   # Area code
    -       # Separator
    \d{4}   # Number
""", re.VERBOSE)

📋 Common Patterns

Email Validation

email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"

def is_valid_email(email):
    """Return True if *email* has the simple ``local@domain.tld`` shape.

    This is a pragmatic format check, not full RFC 5322 validation: the
    local part may contain letters, digits and ``._%+-``; the domain may
    contain letters, digits, dots and hyphens; the final label must be at
    least two letters.

    Args:
        email: The string to check.

    Returns:
        bool: True when the whole string matches ``email_pattern``.
    """
    # Use fullmatch() rather than match(): "$" also matches just before a
    # trailing newline, so match() would accept "user@example.com\n".
    return re.fullmatch(email_pattern, email) is not None

print(is_valid_email("user@example.com"))  # True
print(is_valid_email("invalid.email"))     # False

Phone Number

phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

phones = """
123-456-7890
(123) 456-7890
123.456.7890
123 456 7890
"""

print(re.findall(phone_pattern, phones))

URL

url_pattern = r"https?://(?:www\.)?[\w.-]+(?:/[\w.-]*)*"

text = "Visit https://www.example.com/path or http://test.org"
print(re.findall(url_pattern, text))

Date

# DD/MM/YYYY or DD-MM-YYYY
date_pattern = r"\b(\d{1,2})[/-](\d{1,2})[/-](\d{4})\b"

text = "Dates: 25/12/2023, 01-01-2024"
for match in re.finditer(date_pattern, text):
    day, month, year = match.groups()
    print(f"Day: {day}, Month: {month}, Year: {year}")

IP Address

ip_pattern = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"

text = "Server IPs: 192.168.1.1, 10.0.0.1"
print(re.findall(ip_pattern, text))

HTML Tags

# Extract tag content
html = "<p>Hello</p><span>World</span>"
pattern = r"<(\w+)>(.*?)</\1>"

for match in re.finditer(pattern, html):
    print(f"Tag: {match.group(1)}, Content: {match.group(2)}")

⚡ Compiled Patterns

For repeated use, compile patterns for better performance.

import re

# Compile pattern
email_regex = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")

# Use compiled pattern
emails = ["user@example.com", "invalid", "test@test.org"]
for email in emails:
    if email_regex.match(email):
        print(f"Valid: {email}")

📋 Summary

| Function      | Purpose             |
| ------------- | ------------------- |
| re.search()   | Find first match    |
| re.match()    | Match at start      |
| re.findall()  | Find all matches    |
| re.finditer() | Iterator of matches |
| re.sub()      | Replace pattern     |
| re.split()    | Split by pattern    |
| re.compile()  | Compile pattern     |

🎯 Next Steps

After mastering regex, proceed to 14_concurrency to learn about threading, multiprocessing, and async/await!

README - Python Tutorial | DeepML