README
13 - Regular Expressions (Regex)
📌 What You'll Learn
- Pattern matching basics
- Metacharacters and special sequences
- Character classes
- Quantifiers
- Groups and capturing
- Common patterns
- re module functions
🔍 What is Regex?
Regular Expressions are patterns used to match character combinations in strings. Python's re module provides regex support.
import re
# Basic match
text = "Hello, World!"
if re.search(r"World", text):
print("Found!")
📝 Basic Functions
re.search() - Find Pattern
import re
text = "The rain in Spain"
# Returns Match object or None
match = re.search(r"rain", text)
if match:
print(f"Found: {match.group()}")
print(f"Position: {match.start()}-{match.end()}")
re.match() - Match at Start
# Only matches at the beginning
match = re.match(r"The", text) # Match!
match = re.match(r"rain", text) # None (not at start)
re.findall() - Find All Matches
text = "cat bat rat sat"
matches = re.findall(r".at", text)
print(matches) # ['cat', 'bat', 'rat', 'sat']
re.finditer() - Iterator of Matches
for match in re.finditer(r".at", text):
print(f"{match.group()} at {match.start()}")
re.sub() - Replace Pattern
text = "Hello, World!"
result = re.sub(r"World", "Python", text)
print(result) # Hello, Python!
# Replace multiple
text = "one two three"
result = re.sub(r"\s", "-", text)
print(result) # one-two-three
re.split() - Split by Pattern
text = "one, two; three four"
parts = re.split(r"[,;\s]+", text)
print(parts) # ['one', 'two', 'three', 'four']
🔤 Metacharacters
| Character | Meaning |
| --------- | ------------------------------ |
| . | Any character (except newline) |
| ^ | Start of string |
| $ | End of string |
| * | 0 or more repetitions |
| + | 1 or more repetitions |
| ? | 0 or 1 repetition |
| \ | Escape special character |
| \| | OR (alternation) |
| () | Group |
| [] | Character class |
| {} | Specific repetitions |
import re
# . matches any character
re.findall(r"c.t", "cat cot cut") # ['cat', 'cot', 'cut']
# ^ and $ anchors
re.search(r"^Hello", "Hello World") # Matches
re.search(r"World$", "Hello World") # Matches
# | alternation
re.findall(r"cat|dog", "I have a cat and a dog") # ['cat', 'dog']
📊 Character Classes
| Class | Meaning |
|---|---|
| [abc] | a, b, or c |
| [^abc] | NOT a, b, or c |
| [a-z] | a through z |
| [A-Z] | A through Z |
| [0-9] | 0 through 9 |
| [a-zA-Z0-9] | Alphanumeric |
# Character class examples
re.findall(r"[aeiou]", "hello") # ['e', 'o']
re.findall(r"[^aeiou]", "hello") # ['h', 'l', 'l']
re.findall(r"[0-9]+", "abc123xyz456") # ['123', '456']
🔢 Special Sequences
| Sequence | Meaning |
|---|---|
| \d | Digit [0-9] |
| \D | Non-digit |
| \w | Word char [a-zA-Z0-9_] |
| \W | Non-word char |
| \s | Whitespace |
| \S | Non-whitespace |
| \b | Word boundary |
| \B | Non-word boundary |
text = "My phone is 123-456-7890"
re.findall(r"\d+", text) # ['123', '456', '7890']
re.findall(r"\w+", text) # ['My', 'phone', 'is', '123', '456', '7890']
# Word boundaries
re.findall(r"\bcat\b", "cat category scat") # ['cat']
🔁 Quantifiers
| Quantifier | Meaning |
|---|---|
| * | 0 or more |
| + | 1 or more |
| ? | 0 or 1 |
| {n} | Exactly n |
| {n,} | n or more |
| {n,m} | Between n and m |
text = "abbb ac abc abbc"
re.findall(r"ab*", text) # ['abbb', 'a', 'ab', 'abb']
re.findall(r"ab+", text) # ['abbb', 'ab', 'abb']
re.findall(r"ab?", text) # ['ab', 'a', 'ab', 'ab']
re.findall(r"ab{2}", text) # ['abb', 'abb']
re.findall(r"ab{1,2}", text) # ['abb', 'ab', 'abb']
Greedy vs Non-Greedy
text = "<div>content</div>"
# Greedy (default) - matches as much as possible
re.findall(r"<.*>", text) # ['<div>content</div>']
# Non-greedy (?) - matches as little as possible
re.findall(r"<.*?>", text) # ['<div>', '</div>']
👥 Groups and Capturing
Basic Groups
text = "John Smith, Jane Doe"
# Parentheses create groups
matches = re.findall(r"(\w+) (\w+)", text)
print(matches) # [('John', 'Smith'), ('Jane', 'Doe')]
# Access groups
match = re.search(r"(\w+) (\w+)", text)
print(match.group(0)) # John Smith (full match)
print(match.group(1)) # John
print(match.group(2)) # Smith
print(match.groups()) # ('John', 'Smith')
Named Groups
pattern = r"(?P<first>\w+) (?P<last>\w+)"
match = re.search(pattern, "John Smith")
print(match.group('first')) # John
print(match.group('last')) # Smith
print(match.groupdict()) # {'first': 'John', 'last': 'Smith'}
Non-Capturing Groups
# (?:...) groups but doesn't capture
pattern = r"(?:Mr|Mrs|Ms)\. (\w+)"
match = re.search(pattern, "Mr. Smith")
print(match.group(1)) # Smith (not Mr)
Backreferences
# \1, \2, etc. refer to captured groups
pattern = r"(\w+) \1" # Repeated word
text = "the the cat sat sat"
matches = re.findall(pattern, text)
print(matches) # ['the', 'sat']
🚩 Flags
import re
text = "Hello\nworld"
# IGNORECASE (re.I)
re.findall(r"hello", "Hello HELLO", re.IGNORECASE)
# MULTILINE (re.M) - ^ and $ match line start/end
re.findall(r"^\w+", text, re.MULTILINE) # ['Hello', 'world']
# DOTALL (re.S) - . matches newline too
re.findall(r"Hello.world", text, re.DOTALL) # ['Hello\nworld']
# VERBOSE (re.X) - ignores whitespace in the pattern and allows comments
pattern = re.compile(r"""
\d{3} # Area code
- # Separator
\d{4} # Number
""", re.VERBOSE)
📋 Common Patterns
Email Validation
email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
def is_valid_email(email):
return bool(re.match(email_pattern, email))
print(is_valid_email("user@example.com")) # True
print(is_valid_email("invalid.email")) # False
Phone Number
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
phones = """
123-456-7890
(123) 456-7890
123.456.7890
123 456 7890
"""
print(re.findall(phone_pattern, phones))
URL
url_pattern = r"https?://(?:www\.)?[\w.-]+(?:/[\w.-]*)*"
text = "Visit https://www.example.com/path or http://test.org"
print(re.findall(url_pattern, text))
Date
# DD/MM/YYYY or DD-MM-YYYY
date_pattern = r"\b(\d{1,2})[/-](\d{1,2})[/-](\d{4})\b"
text = "Dates: 25/12/2023, 01-01-2024"
for match in re.finditer(date_pattern, text):
day, month, year = match.groups()
print(f"Day: {day}, Month: {month}, Year: {year}")
IP Address
ip_pattern = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
text = "Server IPs: 192.168.1.1, 10.0.0.1"
print(re.findall(ip_pattern, text))
HTML Tags
# Extract tag content
html = "<p>Hello</p><span>World</span>"
pattern = r"<(\w+)>(.*?)</\1>"
for match in re.finditer(pattern, html):
print(f"Tag: {match.group(1)}, Content: {match.group(2)}")
⚡ Compiled Patterns
For repeated use, compile patterns for better performance.
import re
# Compile pattern
email_regex = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")
# Use compiled pattern
emails = ["user@example.com", "invalid", "test@test.org"]
for email in emails:
if email_regex.match(email):
print(f"Valid: {email}")
📋 Summary
| Function | Purpose |
|---|---|
| re.search() | Find first match |
| re.match() | Match at start |
| re.findall() | Find all matches |
| re.finditer() | Iterator of matches |
| re.sub() | Replace pattern |
| re.split() | Split by pattern |
| re.compile() | Compile pattern |
🎯 Next Steps
After mastering regex, proceed to 14_concurrency to learn about threading, multiprocessing, and async/await!