Python RegEx

🔍 What is RegEx?

RegEx (Regular Expressions) is like a super-powered search tool! Instead of just looking for exact words, you can search for patterns. Think of it as creating a recipe that describes what kind of text you're looking for.


import re

# Scan the sentence for every phone number shaped like NNN-NNNN
text = "Call me at 555-1234 or 555-5678"
phone_pattern = r'\d{3}-\d{4}'  # three digits, a dash, four digits
phone_numbers = re.findall(phone_pattern, text)
print(phone_numbers)  # ['555-1234', '555-5678']

Pattern

Search

Text

Processing

Powerful

Tool

Quick Reference: RegEx Functions

🔍

re.search()

Find first match

# re.search returns a match object for the first hit, or None when nothing matches
sentence = "I have 5 cats"
match = re.search(r'\d+', sentence)
if match is not None:
    print(match.group(0))  # "5"

📋

re.findall()

Find all matches

# re.findall returns every non-overlapping match as a list of strings
sentence = "I have 5 cats and 3 dogs"
numbers = re.findall(r'\d+', sentence)
print(numbers)  # ['5', '3']

🔄

re.sub()

Replace matches

# re.sub replaces each run of digits with the placeholder "X"
sentence = "I have 5 cats"
result = re.sub(r'\d+', 'X', sentence)
print(result)  # "I have X cats"

✂️

re.split()

Split by pattern

# re.split cuts the string wherever a comma or a semicolon appears
delimited = "a,b;c,d"
parts = re.split(r'[,;]', delimited)
print(parts)  # ['a', 'b', 'c', 'd']

Lesson 1: Finding Simple Text

Let's start by searching for exact words in text:

Basic Text Search

import re

# Sample sentence that mentions "Python" twice
text = "I love Python programming. Python is awesome!"

# re.search gives back a match object when the word is present, otherwise None
found = re.search("Python", text) is not None
print("✅ Found 'Python' in the text!" if found else "❌ 'Python' not found")

# Count the exact (case-sensitive) occurrences
matches = re.findall("Python", text)
print(f"Found 'Python' {len(matches)} times")

# Count occurrences regardless of upper/lower case
matches_ignore_case = re.findall("python", text, flags=re.IGNORECASE)
print(f"Found 'python' (any case) {len(matches_ignore_case)} times")

Lesson 2: Finding Numbers

Let's learn to find numbers in text using special patterns:

Finding Numbers

import re

# Sentence containing several whole numbers
text = "I have 5 apples, 12 oranges, and 100 grapes."

# \d matches a single digit (0-9); the trailing + extends it to
# "one or more digits", so each whole number is captured in one piece
numbers = re.findall(r'\d+', text)
print("Numbers found:", numbers)  # ['5', '12', '100']

# findall returns strings, so convert them before doing arithmetic
actual_numbers = list(map(int, numbers))
print("As integers:", actual_numbers)  # [5, 12, 100]
print("Total:", sum(actual_numbers))   # 117

Finding Decimal Numbers

# Sentence containing prices written with a decimal point
price_text = "Apples cost $3.50, oranges cost $2.25, and bananas cost $1.99"

# One or more digits, a literal dot (escaped as \.), then one or more digits
prices = re.findall(r'\d+\.\d+', price_text)
print("Prices found:", prices)  # ['3.50', '2.25', '1.99']

# Convert each matched string to a float and add them up
total = sum(map(float, prices))
print(f"Total cost: ${total:.2f}")  # Total cost: $7.74

Lesson 3: Finding Email Addresses

Let's find email addresses in text using patterns:

Simple Email Finder

import re

# Multi-line text that contains three email addresses
contact_info = """
Contact us at support@company.com or sales@business.org.
You can also reach admin@website.net for technical issues.
"""

# Deliberately simple pattern of the form word@word.word:
#   \w+  one or more word characters (letters, numbers, underscore)
#   @    a literal at-sign
#   \.   a literal dot (escaped so it does not mean "any character")
email_pattern = r'\w+@\w+\.\w+'
emails = re.findall(email_pattern, contact_info)

print("Email addresses found:")
for address in emails:
    print(f"📧 {address}")

# Output:
# 📧 support@company.com
# 📧 sales@business.org
# 📧 admin@website.net

Lesson 4: RegEx Pattern Syntax

Regular expressions use special characters to define patterns:

🎯 Basic Patterns

.

Any character except newline

a.c

matches "abc", "axc"

^

Start of string


             ^Hello

matches "Hello World"

$

End of string


             World$

matches "Hello World"

*

Zero or more occurrences

ab*

matches "a", "ab", "abb"

+

One or more occurrences

ab+

matches "ab", "abb"

?

Zero or one occurrence

ab?

matches "a", "ab"

📊 Character Classes

\d

Any digit (0-9)

\d+

matches "123"

\w

Any word character (a-z, A-Z, 0-9, _)

\w+

matches "hello_123"

\s

Any whitespace character

\s+

matches spaces, tabs


             [abc]

Any character in brackets


             [aeiou]

matches vowels


             [a-z]

Any character in range


             [0-9]

matches digits


             [^abc]

Any character NOT in brackets


             [^0-9]

matches non-digits

Testing Different Patterns

import re

# Sample sentence containing a phone number and an email address
text = "My phone is 555-1234 and my email is user@example.com"

# Run four patterns of increasing specificity against the same text:
digits = re.findall(r'\d', text)               # \d         -> one digit per match
digit_groups = re.findall(r'\d+', text)        # \d+        -> whole runs of digits
words = re.findall(r'\w+', text)               # \w+        -> runs of word characters
phone = re.findall(r'\d{3}-\d{4}', text)       # \d{3}-\d{4} -> 3 digits, dash, 4 digits

# Show what each pattern captured
print("Individual digits:", digits)  # ['5', '5', '5', '1', '2', '3', '4']
print("Digit groups:", digit_groups)  # ['555', '1234']
print("Words:", words)  # ['My', 'phone', 'is', '555', '1234', 'and', 'my', 'email', 'is', 'user', 'example', 'com']
print("Phone numbers:", phone)  # ['555-1234']

Lesson 5: Replacing Text with Patterns

You can use RegEx to find and replace text patterns:

Text Replacement

import re

# Mask phone numbers for privacy: every NNN-NNNN run becomes "XXX-XXXX"
text = "Call me at 555-1234 or text 555-5678"
phone_pattern = r'\d{3}-\d{4}'
hidden = re.sub(phone_pattern, 'XXX-XXXX', text)
print("Original:", text)
print("Hidden:", hidden)
# Output: Call me at XXX-XXXX or text XXX-XXXX

# Collapse each run of whitespace (\s+) into a single space
messy_text = "This   has    too     many    spaces"
whitespace_run = r'\s+'
clean_text = re.sub(whitespace_run, ' ', messy_text)
print("Messy:", messy_text)
print("Clean:", clean_text)
# Output: This has too many spaces

Formatting Phone Numbers

# The same ten digits written in four different styles
phone_numbers = ["5551234567", "555.123.4567", "555-123-4567", "(555) 123-4567"]


def format_phone(phone):
    """Return *phone* rewritten as ``(XXX) XXX-XXXX``.

    Every non-digit character is stripped first. If exactly ten digits
    remain they are regrouped as 3-3-4; otherwise the string
    "Invalid phone number" is returned.
    """
    # \D is the inverse of \d: it matches anything that is not a digit
    digits_only = re.sub(r'\D', '', phone)

    # Guard clause: anything other than ten digits cannot be formatted
    if len(digits_only) != 10:
        return "Invalid phone number"

    area, prefix, line = digits_only[:3], digits_only[3:6], digits_only[6:]
    return f"({area}) {prefix}-{line}"


print("Formatted phone numbers:")
for phone in phone_numbers:
    # :<15 left-aligns the original in a 15-character column
    print(f"{phone:<15} → {format_phone(phone)}")

# Output:
# 5551234567      → (555) 123-4567
# 555.123.4567    → (555) 123-4567
# 555-123-4567    → (555) 123-4567
# (555) 123-4567  → (555) 123-4567

Practice Project: Text Cleaner

Let's build a simple text cleaner that uses RegEx to fix common text problems:

🔹 Basic Text Cleaning

import re

def clean_basic_text(text):
    """Normalize whitespace and repeated punctuation in *text*.

    Each run of whitespace becomes a single space, each run of two or
    more of the characters ``!``, ``?`` and ``.`` is reduced to the first
    character of that run, and leading/trailing whitespace is stripped.
    """
    def keep_first(match):
        # A run such as "!!!" or "?!" is replaced by its first character
        return match.group()[0]

    single_spaced = re.sub(r'\s+', ' ', text)
    deduplicated = re.sub(r'[!?\.]{2,}', keep_first, single_spaced)
    return deduplicated.strip()

🔹 Advanced Cleaning Features

def clean_advanced(text):
    """Tidy punctuation spacing and capitalize sentence starts in *text*.

    Whitespace directly before ``,``, ``.``, ``!`` or ``?`` is removed.
    Then a lowercase letter a-z is upper-cased when it sits at the very
    start of the string or follows ``.``, ``!`` or ``?`` plus whitespace.
    """
    def capitalize_start(match):
        # Group 1 is the sentence boundary (possibly empty at the start of
        # the string); group 2 is the lowercase letter that follows it.
        boundary, letter = match.groups()
        return boundary + letter.upper()

    tightened = re.sub(r'\s+([,.!?])', r'\1', text)
    return re.sub(r'(^|[.!?]\s+)([a-z])', capitalize_start, tightened)

🔹 Information Extraction

def extract_text_info(text):
    """Collect simple pattern-based facts about *text*.

    Returns a dict with four keys:
      'emails'     - substrings of the form word@word.word
      'phones'     - substrings of the form NNN-NNNN
      'numbers'    - every run of digits, converted to int
      'word_count' - how many runs of word characters the text contains
    """
    emails = re.findall(r'\w+@\w+\.\w+', text)
    phones = re.findall(r'\d{3}-\d{4}', text)
    numbers = list(map(int, re.findall(r'\d+', text)))
    words = re.findall(r'\w+', text)

    return {
        'emails': emails,
        'phones': phones,
        'numbers': numbers,
        'word_count': len(words),
    }

🔹 Usage Example

# Deliberately messy sample input
text = "hello!!!   contact   me@email.com   or 555-1234..."

# Clean in two passes (basic first, then advanced); extraction runs on the raw text
basic = clean_basic_text(text)
clean = clean_advanced(basic)
info = extract_text_info(text)

print(f"Cleaned: {clean}")
print(f"Info: {info}")

Common Mistakes to Avoid

🚫 Forgetting the 'r' prefix

Wrong: re.findall('\d+', text)

Right: re.findall(r'\d+', text)

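Always write regex patterns as raw strings (r"pattern") so that Python passes backslash sequences such as \d to the regex engine unchanged instead of treating them as string escapes.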

🚫 Not escaping special characters

Wrong: re.findall(r'$5.99', text)

Right: re.findall(r'\$5\.99', text)

Use a backslash (\) to escape special characters such as the dollar sign ($) and the dot (.) so that they match literally.

🚫 Making patterns too complex

Complex: r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

Simple: r'\w+@\w+\.\w+'

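Start simple and add complexity only when needed. Keep in mind what the simple pattern misses: it will not fully match addresses that contain dots or hyphens, such as first.last@mail.example.com.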