Advanced Patterns and Flags

Introduction

Regular expressions are incredibly powerful, but to truly master them, you need to understand the flags that modify their behavior and the advanced patterns that handle complex scenarios. Think of regex flags as "settings" that change how the engine interprets your patterns, and advanced patterns as the "special moves" that solve tricky text processing problems.

In this lesson, we'll explore regex flags, Unicode support, conditional patterns, and performance optimization techniques that will make you a regex expert.

Regex Flags

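Flags modify how regex patterns are interpreted. You can pass them through the flags parameter of the re functions (the third positional argument of re.search, re.match and re.findall; the second of re.compile) or embed them in the pattern as inline flags.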

re.IGNORECASE (re.I)

Makes pattern matching case-insensitive:

import re

text = "Python is a programming language."
pattern = r'python'

# Default behaviour: matching is case-sensitive, so lowercase 'python' is not found
match1 = re.search(pattern, text)
print(match1)  # None

# Passing re.IGNORECASE lets the same pattern match regardless of case
match2 = re.search(pattern, text, flags=re.IGNORECASE)
print(match2.group())  # "Python"

# The inline form (?i) carries the flag inside the pattern itself
match3 = re.search(r'(?i)python', text)
print(match3.group())  # "Python"

re.MULTILINE (re.M)

Changes how ^ and $ work - in addition to the start and end of the whole string, they also match at the start and end of each line:

text = """Line 1: Start
Line 2: Middle
Line 3: End"""

line_start = r'^Line \d+'

# No flag: ^ only anchors at the very beginning of the string
matches1 = re.findall(line_start, text)
print(matches1)  # ['Line 1']

# re.MULTILINE: ^ also anchors right after every newline
matches2 = re.findall(line_start, text, flags=re.MULTILINE)
print(matches2)  # ['Line 1', 'Line 2', 'Line 3']

# Same effect with the inline (?m) flag
matches3 = re.findall(r'(?m)^Line \d+', text)
print(matches3)  # ['Line 1', 'Line 2', 'Line 3']

re.DOTALL (re.S)

Makes the dot . match any character, including newlines:

text = """<div>
Content here
</div>"""

div_block = r'<div>.*</div>'

# No flag: '.' stops at a newline, so the multi-line element is not found
match1 = re.search(div_block, text)
print(match1)  # None (dot doesn't match newlines)

# re.DOTALL: '.' matches newlines too, so the whole element is found
match2 = re.search(div_block, text, flags=re.DOTALL)
print(match2.group())  # "<div>\nContent here\n</div>"

# Same effect with the inline (?s) flag
match3 = re.search(r'(?s)<div>.*</div>', text)
print(match3.group())  # "<div>\nContent here\n</div>"

re.VERBOSE (re.X)

Allows you to write regex patterns with whitespace and comments for readability:

# Compact form - every token is packed onto one line
phone_pattern = r'^\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})$'

# Verbose form - identical pattern, one commented token per line
phone_pattern_verbose = re.compile(r"""
    ^           # Start of string
    \(?         # Optional opening parenthesis
    (\d{3})     # Area code (3 digits)
    \)?         # Optional closing parenthesis
    [-.\s]?     # Optional separator (dash, dot, or space)
    (\d{3})     # Exchange (3 digits)
    [-.\s]?     # Optional separator
    (\d{4})     # Number (4 digits)
    $           # End of string
""", re.VERBOSE)

test_numbers = ["555-123-4567", "(555) 123-4567", "555.123.4567"]
# Keep only the numbers the pattern accepts, then report each one
for number in filter(phone_pattern_verbose.match, test_numbers):
    print(f"Valid: {number}")

Combining Flags

You can combine multiple flags:

# Flags are bit masks, so several can be OR-ed together
combined_flags = re.IGNORECASE | re.MULTILINE
pattern = re.compile(r'python', combined_flags)

# The inline spelling lists the flag letters together
pattern = re.compile(r'(?im)python')  # Same as above

Advanced Character Classes

Unicode Support

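In Python 3, str patterns are Unicode-aware by default, so \w, \d and \s match international characters. Unicode property escapes such as \p{L} are not supported by the standard re module: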

# \w, \d and \s are Unicode-aware by default for str patterns in Python 3
pattern = r'\w+'
text = "Hello 世界 café naïve"

matches = re.findall(pattern, text)
print(matches)  # ['Hello', '世界', 'café', 'naïve']

# Unicode property escapes as written in other regex engines.
# The standard-library `re` module does NOT understand \p{...}: compiling any
# of these raises re.error. The third-party `regex` package accepts them.
patterns = [
    r'\p{L}+',      # Letters
    r'\p{N}+',      # Numbers
    r'\p{Z}+',      # Spaces
    r'\p{P}+',      # Punctuation
]

# Approximate equivalents that work with the standard-library `re` module
stdlib_patterns = [
    r'[^\W\d_]+',   # Letters: word characters minus digits and underscore
    r'\d+',         # Decimal digits (narrower than \p{N})
    r'\s+',         # Whitespace (not identical to \p{Z})
    r'[^\w\s]+',    # Punctuation and symbols (broader than \p{P})
]

stdlib_matches = [re.findall(candidate, text) for candidate in stdlib_patterns]
for candidate, found in zip(stdlib_patterns, stdlib_matches):
    print(f"{candidate}: {found}")

# Note: re.UNICODE is already the default for str patterns in Python 3;
# pass re.ASCII to restrict \w, \d and \s to ASCII characters only

POSIX Character Classes

# Python's `re` module has no POSIX bracket classes. A pattern such as
# r'[[:alpha:]]+' is not rejected - it is parsed as the character set
# '[', ':', 'a', 'l', 'p', 'h' followed by a literal ']' - so it silently
# matches the wrong text instead of raising. Spell the classes out instead:
patterns = [
    r'[A-Za-z]+',       # [[:alpha:]] - alphabetic characters
    r'[0-9]+',          # [[:digit:]] - digits
    r'[ \t\n\r\f\v]+',  # [[:space:]] - whitespace
    r'[A-Za-z0-9]+',    # [[:alnum:]] - alphanumeric
]

text = "Hello 123 World!"
for pattern in patterns:
    matches = re.findall(pattern, text)
    print(f"{pattern}: {matches}")

Custom Character Classes

# Vowels (either case)
vowels = re.compile(r'[aeiou]', re.IGNORECASE)

# Hexadecimal numbers: the \b anchors require the WHOLE word to consist of
# hex digits. Without them the class also picks up stray a-f letters inside
# ordinary words ('F' in "Function", 'c' in "contains", ...).
# Words made up only of hex letters (e.g. "add", "face") still match.
hex_digits = re.compile(r'\b[0-9a-fA-F]+\b')

# Custom ranges
custom = re.compile(r'[a-zA-Z0-9_$]+')  # Valid identifier characters

text = "Function $var123 contains hex: FF00 and vowels: aeiou"
print("Vowels:", vowels.findall(text))
print("Hex:", hex_digits.findall(text))        # ['FF00']
print("Identifiers:", custom.findall(text))

Conditional Patterns

Conditional Groups

# Match different formats based on conditions
# Syntax: (?(id/name)yes-pattern|no-pattern)
# The yes-pattern is used if the group with that id/name took part in the
# match, otherwise the no-pattern is used.

# Example: an e-mail address that may be wrapped in angle brackets.
# If the opening '<' (group 1) matched, a closing '>' is required;
# otherwise the address must run to the end of the string.
conditional = re.compile(r'(<)?(\w+@\w+(?:\.\w+)+)(?(1)>|$)')
for candidate in ['<user@host.com>', 'user@host.com', '<user@host.com', 'user@host.com>']:
    print(f"{candidate}: {bool(conditional.match(candidate))}")
# <user@host.com>: True / user@host.com: True
# <user@host.com: False / user@host.com>: False

# For comparison, the same "opening decides the closing" idea written with
# plain alternation (no conditional): each quote style gets its own branch
pattern = r'"([^"]*)"|\'([^\']*)\''
text = 'He said "Hello" and \'Goodbye\''

matches = re.findall(pattern, text)
print(matches)  # [('Hello', ''), ('', 'Goodbye')]

Recursive Patterns

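For complex nested structures. Note that recursion with (?R) is not supported by Python's standard re module (compiling such a pattern raises re.error); it is available in the third-party regex package: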

# Balanced parentheses with the standard-library `re` module.
# `re` has no recursion: compiling r'\((?:[^()]|(?R))*\)' raises re.error.
# Nesting therefore has to be unrolled to a fixed depth. This pattern
# accepts one nested level: "(...)" whose content is either a plain
# character or a complete inner "(...)" group with no further nesting.
pattern = r'\((?:[^()]|\([^()]*\))*\)'
text = "(a(b)c)d"

match = re.search(pattern, text)
if match:
    print(match.group())  # "(a(b)c)"

Performance Optimization

Pre-compiling Patterns

import time

# Less efficient - the pattern string is handed to re.search() on every call.
# (re keeps a cache of recently compiled patterns, so this is a cache lookup
# rather than a full recompile, but the lookup still costs time in a hot loop.)
def bad_function(text):
    """Search `text` 1000 times through the module-level re.search()."""
    for _ in range(1000):
        re.search(r'\b\w+\b', text)

# Better - compile once and reuse the pattern object
word_pattern = re.compile(r'\b\w+\b')

def good_function(text):
    """Search `text` 1000 times with the precompiled `word_pattern`."""
    for _ in range(1000):
        word_pattern.search(text)

text = "This is a test string with many words" * 100

# perf_counter() is the clock intended for measuring short durations;
# time.time() can jump if the system clock is adjusted
start = time.perf_counter()
bad_function(text)
print(f"Bad: {time.perf_counter() - start:.3f}s")

start = time.perf_counter()
good_function(text)
print(f"Good: {time.perf_counter() - start:.3f}s")

Avoiding Common Pitfalls

1. Catastrophic Backtracking

# Problematic pattern - a quantifier nested inside another quantifier
# can cause exponential time
bad_pattern = r'(a*)*b'
text = 'a' * 30

# Left commented out on purpose: the text contains no 'b', so the engine
# would try a huge number of ways to split the run of 'a's before giving up.
# It would not crash, but it can appear to hang.
# match = re.search(bad_pattern, text)

# Better - use atomic groups or possessive quantifiers
# NOTE: the standard-library re module only accepts (?>...) and possessive
# quantifiers from Python 3.11 onwards; earlier versions raise re.error
good_pattern = r'(?>a*)b'  # Atomic group
# or
good_pattern2 = r'a*b'     # Simpler equivalent

2. Greedy vs Lazy Matching

text = '<div>Content 1</div><div>Content 2</div>'

# Greedy .* runs on to the LAST closing tag, swallowing both elements
greedy_pattern = re.compile(r'<div>.*</div>')
greedy = greedy_pattern.search(text)
print(greedy.group())  # Matches entire string

# Lazy .*? stops at the FIRST closing tag it can reach
lazy_pattern = re.compile(r'<div>.*?</div>')
lazy = lazy_pattern.search(text)
print(lazy.group())    # "<div>Content 1</div>"

3. Using the Right Tool

text = "simple word matching"

# Splitting on whitespace needs no regex at all
words2 = text.split()

# The regex route gives the same list here, with more machinery
words1 = re.compile(r'\b\w+\b').findall(text)

print(words1 == words2)  # True

Regex Debugging

import re

# Compiling with re.DEBUG prints the parsed structure of the pattern
pattern = re.compile(r'(?P<word>\b\w+\b)', flags=re.DEBUG)

# Run a handful of inputs through it to see what the named group captures
test_cases = ["hello", "hello world", "123 abc"]
for test in test_cases:
    match = pattern.search(test)
    if match is None:
        continue
    print(f"'{test}' -> {match.groupdict()}")

Real-World Advanced Examples

Email Validation with Unicode Support

# Compiled once at import time instead of on every call.
# \w and [^\W_] are Unicode-aware for str patterns, so letters and digits
# from any script are accepted, not just ASCII.
_INTERNATIONAL_EMAIL = re.compile(r"""
    ^                           # Start
    (?P<local>                  # Local part: word characters plus specials
        [\w.!#$%&'*+/=?^`{|}~-]+
    )
    @                           # @
    (?P<domain>                 # Domain part: dot-separated labels
        [^\W_](?:(?:[^\W_]|-){0,61}[^\W_])?
        (?:\.[^\W_](?:(?:[^\W_]|-){0,61}[^\W_])?)*
    )
    \Z                          # True end of string (no trailing newline)
""", re.VERBOSE)


def validate_international_email(email):
    """Validate email with international characters.

    Args:
        email: The address to check, as a str.

    Returns:
        True if `email` has the shape local@domain, where the local part is
        made of word characters (any script) and the usual special
        characters, and the domain is one or more dot-separated labels of
        1-63 letters/digits/hyphens that do not start or end with a hyphen.
        False otherwise.

    This checks syntax only; it does not verify that the domain exists.
    """
    return bool(_INTERNATIONAL_EMAIL.match(email))

emails = ["user@example.com", "用户@例子.中国", "test.email+tag@domain.co.uk"]
for email in emails:
    print(f"{email}: {'Valid' if validate_international_email(email) else 'Invalid'}")

Log File Analysis with Multiple Flags

# One log entry: a header line, optionally followed by a "Details:" part.
# '.' deliberately does NOT match newlines here (no re.DOTALL) and the
# pattern is anchored with ^ ... $ per line. Without the end anchor the lazy
# message group would always match the empty string.
_LOG_ENTRY = re.compile(r"""
    ^(?P<timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})  # Timestamp
    [ \t]+\[(?P<level>\w+)\][ \t]+                        # Log level
    (?P<component>[\w.]+):[ \t]+                          # Component
    (?P<message>.*?)                                      # Message (lazy, single line)
    (?:\s+Details:[ \t]*(?P<details>.*?))?                # Optional details
    [ \t]*$                                               # End of line
""", re.VERBOSE | re.MULTILINE)


def parse_complex_log(log_text):
    """Parse complex log entries with various formats.

    Args:
        log_text: Log text in which each entry starts with
            "YYYY-MM-DD HH:MM:SS [LEVEL] component: message" and may be
            followed, on the same or the next line, by "Details: ...".

    Returns:
        A list with one dict per entry, holding the keys 'timestamp',
        'level', 'component', 'message' and 'details' ('details' is None
        when the entry has none). Each dict is also printed.
    """
    entries = [match.groupdict() for match in _LOG_ENTRY.finditer(log_text)]
    for entry in entries:
        print(entry)
    return entries

log_text = """
2023-12-01 10:30:45 [INFO] web.server: User login successful
2023-12-01 10:31:22 [ERROR] db.connection: Connection failed
Details: Timeout after 30 seconds
2023-12-01 10:32:15 [WARN] auth.service: Invalid password attempt
"""

parse_complex_log(log_text)

Code Analysis Tool

def analyze_python_code(code):
    """Analyze Python code for various patterns.

    Args:
        code: Python source code as a single string.

    Returns:
        A dict with the keys 'functions', 'classes', 'imports', 'comments'
        and 'strings'. 'functions' and 'classes' hold the captured names;
        the other categories hold the full matched text.

    This is a heuristic regex scan, not a parser: a '#' inside a string
    literal is still reported as a comment, and triple-quoted strings are
    not recognised as a single string.
    """
    patterns = {
        # \b avoids matching inside longer words; the optional "-> ..." part
        # lets functions with a return annotation be found too
        'functions': re.compile(r'\bdef\s+(\w+)\s*\([^)]*\)\s*(?:->[^:\n]+)?:'),
        'classes': re.compile(r'\bclass\s+(\w+)'),
        # [\w.]+ so dotted and relative modules ("from os.path import ...") match
        'imports': re.compile(r'^(?:from\s+[\w.]+\s+)?import\s+.*$', re.MULTILINE),
        'comments': re.compile(r'#.*$', re.MULTILINE),
        # No capture groups, so findall() returns the full string literal
        # (quotes included) rather than a tuple of groups
        'strings': re.compile(
            r'"(?:\\.|[^"\\])*"'       # double-quoted, backslash escapes allowed
            r"|'(?:\\.|[^'\\])*'",     # single-quoted, backslash escapes allowed
            re.DOTALL,
        ),
    }

    return {category: pattern.findall(code) for category, pattern in patterns.items()}

code = '''
import re
from collections import defaultdict

class CodeAnalyzer:
    def __init__(self):
        self.patterns = {}
    
    def analyze(self, code):
        # Analyze the code
        return len(code)

# This is a comment
def main():
    analyzer = CodeAnalyzer()
    result = analyzer.analyze("print('hello')")
    return result
'''

analysis = analyze_python_code(code)
for category, items in analysis.items():
    print(f"{category.title()}: {len(items)} found")
    if len(items) <= 5:  # Don't print too many
        print(f"  {items}")

Key Points to Remember

Regex flags modify matching behavior: re.IGNORECASE, re.MULTILINE, re.DOTALL, re.VERBOSE
Inline flags (?i), (?m), (?s), (?x) embed the setting in the pattern itself; they apply to the whole pattern and must appear at its start
Unicode support enables international character matching
Performance matters: Pre-compile patterns, avoid catastrophic backtracking
Choose the right tool: String methods for simple tasks, regex for complex patterns
Verbose mode makes complex patterns maintainable with comments and whitespace

You've now mastered the core of regular expressions - from basic patterns to advanced flags and optimization techniques. In our final lesson, we'll apply everything we've learned to real-world scenarios, building practical tools for data validation, text processing, and automation that you can use in your own projects.