Real-World Applications

Introduction

Regular expressions are not just theoretical concepts - they're powerful tools that solve real-world problems every day. From validating user input to parsing complex data formats, regex is the secret weapon in many developers' toolboxes.

In this final lesson, we'll apply everything we've learned to build practical tools and utilities. You'll see how regex transforms complex text processing tasks into simple, elegant solutions.

Input Validation

Email Validation

Email validation is one of the most common regex applications. Let's build a comprehensive email validator:

import re

class EmailValidator:
    """Validate e-mail addresses against a simplified RFC 5322 grammar.

    The local part is a sequence of dot-separated atoms, so it can neither
    start nor end with a dot nor contain two dots in a row.  The domain is a
    sequence of dot-separated labels of 1-63 characters that start and end
    with a letter or digit.
    """

    def __init__(self):
        # Simplified RFC 5322 "dot-atom" address pattern.
        self.pattern = re.compile(r"""
            \A                      # Start of string
            (?P<local>              # Local part: atoms separated by single dots
                [a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+
                (?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*
            )
            @                       # @ symbol
            (?P<domain>             # Domain part: labels separated by dots
                [a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
                (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
            )
            \Z                      # True end of string (no trailing newline allowed)
        """, re.VERBOSE | re.IGNORECASE)

    def validate(self, email):
        """Validate email format and extract components.

        Args:
            email: The candidate address.  Non-string values are reported as
                invalid rather than raising.

        Returns:
            ``{'valid': True, 'local_part': ..., 'domain': ..., 'full': ...}``
            for a well-formed address, otherwise
            ``{'valid': False, 'error': 'Invalid email format'}``.
        """
        match = self.pattern.match(email) if isinstance(email, str) else None
        if match is None:
            return {'valid': False, 'error': 'Invalid email format'}
        return {
            'valid': True,
            'local_part': match.group('local'),
            'domain': match.group('domain'),
            'full': email
        }

# Demo: run a mix of well-formed and malformed addresses through the validator.
validator = EmailValidator()

emails = [
    "user@example.com",
    "test.email+tag@domain.co.uk",
    "invalid@",
    "user@.com",
    "user..double@domain.com"
]

for email in emails:
    result = validator.validate(email)
    if result['valid']:
        status = "✓ Valid"
    else:
        status = "✗ Invalid"
    print(f"{email}: {status}")
    # The address components are only present in a successful result.
    if result['valid']:
        print(f"  Local: {result['local_part']}, Domain: {result['domain']}")

Password Strength Validation

class PasswordValidator:
    """Check a password against length and character-class rules."""

    def __init__(self):
        # One compiled pattern per rule; a rule passes when its pattern is
        # found anywhere in the password.
        self.patterns = {
            # DOTALL so that a newline inside the password still counts
            # towards its length.
            'length': re.compile(r'.{8,}', re.DOTALL),
            'uppercase': re.compile(r'[A-Z]'),
            'lowercase': re.compile(r'[a-z]'),
            'digit': re.compile(r'\d'),
            'special': re.compile(r'[!@#$%^&*(),.?":{}|<>]')
        }

    def validate(self, password):
        """Check password against multiple criteria.

        Returns:
            A dict mapping each rule name to a bool, plus ``'overall'`` which
            is True only when every rule passes.
        """
        results = {
            criterion: bool(pattern.search(password))
            for criterion, pattern in self.patterns.items()
        }
        results['overall'] = all(results.values())
        return results

    def get_feedback(self, password):
        """Provide detailed feedback on password strength.

        Returns:
            A dict with ``'valid'`` (all rules pass), ``'feedback'`` (one
            message per failed rule) and ``'strength_score'`` (fraction of
            rules passed, between 0.0 and 1.0).
        """
        validation = self.validate(password)

        messages = (
            ('length', "Password must be at least 8 characters long"),
            ('uppercase', "Password must contain at least one uppercase letter"),
            ('lowercase', "Password must contain at least one lowercase letter"),
            ('digit', "Password must contain at least one digit"),
            ('special', "Password must contain at least one special character"),
        )
        feedback = [text for criterion, text in messages
                    if not validation[criterion]]

        # Score only the real rules: 'overall' is derived from them and would
        # otherwise be counted as a sixth rule.
        passed = [validation[criterion] for criterion in self.patterns]

        return {
            'valid': validation['overall'],
            'feedback': feedback,
            'strength_score': sum(passed) / len(passed)
        }

# Demo: score a few sample passwords and list the rules each weak one fails.
validator = PasswordValidator()

passwords = ["weak", "Strong123!", "Password123", "Str0ng!Pass"]

for pwd in passwords:
    result = validator.get_feedback(pwd)
    if result['valid']:
        status = "✓ Strong"
    else:
        status = "✗ Weak"
    score = f"{result['strength_score']:.1%}"
    print(f"'{pwd}': {status} ({score})")
    if result['valid']:
        continue
    # Only weak passwords have feedback worth printing.
    for issue in result['feedback']:
        print(f"  - {issue}")

Data Extraction and Parsing

Log File Analysis

class LogAnalyzer:
    """Parse ``TIMESTAMP [LEVEL] component: message`` log lines."""

    def __init__(self):
        # Pattern for common log formats.  Horizontal whitespace only between
        # fields, so one entry can never swallow the following line.
        self.log_pattern = re.compile(r"""
            (?P<timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})  # Timestamp
            [ \t]+\[(?P<level>\w+)\][ \t]+                       # Log level
            (?P<component>[\w.]+):[ \t]+                         # Component
            (?P<message>.*?)                                     # Message (lazy)
            (?:[ \t]+Details:[ \t]+(?P<details>.*?))?            # Optional details
            [ \t\r]*$                                            # End-of-line anchor
        """, re.VERBOSE | re.MULTILINE)
        # The end-of-line anchor is what forces the lazy message group to
        # grow to the end of the line; without it the group matches nothing.

        self.error_pattern = re.compile(r'ERROR|CRITICAL|FATAL', re.IGNORECASE)

    def parse_logs(self, log_content):
        """Parse log content and extract structured data.

        Args:
            log_content: Text containing zero or more log lines.

        Returns:
            A list of dicts with keys ``timestamp``, ``level``, ``component``,
            ``message``, ``details`` (None when absent) and ``is_error``.
            ``is_error`` is True when the level is ERROR, CRITICAL or FATAL
            (case-insensitive).
        """
        logs = []
        for match in self.log_pattern.finditer(log_content):
            log_entry = match.groupdict()
            # Severity is carried by the level field, not by the message text.
            log_entry['is_error'] = bool(self.error_pattern.search(log_entry['level']))
            logs.append(log_entry)
        return logs

    def get_error_summary(self, logs):
        """Generate error summary from parsed logs.

        Args:
            logs: Entries as returned by ``parse_logs``.

        Returns:
            A dict with ``total_errors``, ``errors_by_component`` (component
            name to error count) and ``recent_errors`` (the last five error
            entries in input order).
        """
        errors = [log for log in logs if log['is_error']]
        components = {}

        for error in errors:
            comp = error['component']
            components[comp] = components.get(comp, 0) + 1

        return {
            'total_errors': len(errors),
            'errors_by_component': components,
            'recent_errors': errors[-5:]  # Last 5 errors
        }

# Demo: parse a small in-memory log and summarise its errors per component.
analyzer = LogAnalyzer()

log_content = """
2023-12-01 10:30:45 [INFO] web.server: User login successful
2023-12-01 10:31:22 [ERROR] db.connection: Connection timeout after 30 seconds
2023-12-01 10:32:15 [WARN] auth.service: Invalid password attempt
2023-12-01 10:33:01 [ERROR] api.endpoint: Rate limit exceeded for IP 192.168.1.1
2023-12-01 10:34:20 [INFO] cache.service: Cache cleared successfully
"""

logs = analyzer.parse_logs(log_content)
summary = analyzer.get_error_summary(logs)

print(f"Total logs parsed: {len(logs)}")
print(f"Errors found: {summary['total_errors']}")
print("Errors by component:")
# One indented line per component that logged at least one error.
for comp, count in summary['errors_by_component'].items():
    print(f"  {comp}: {count}")

CSV Data Parsing

class CSVParser:
    """Minimal regex-based CSV reader for single-line records.

    Supports double-quoted fields that contain commas and ``""``-escaped
    quotes.  Quoted fields spanning several lines are not supported.
    """

    def __init__(self):
        # Pattern to handle quoted fields with commas
        self.field_pattern = re.compile(r'''
            \s*                                           # Leading whitespace
            (?:
                "((?:[^"]|"")*)"                         # Quoted field
                |
                ([^,]*?)                                 # Unquoted field
            )
            \s*                                           # Trailing whitespace
            (?P<sep>,|$)                                  # Comma or end of line
        ''', re.VERBOSE)

    def parse_line(self, line):
        """Parse a single CSV line handling quoted fields.

        Args:
            line: One record without its line terminator.

        Returns:
            The list of field values.  An empty line yields ``[]``; a line
            ending in a comma yields a final empty field.
        """
        if not line:
            return []

        fields = []
        pos = 0

        while True:
            match = self.field_pattern.match(line, pos)
            if match is None:
                break

            quoted, unquoted = match.group(1), match.group(2)
            if quoted is not None:
                # Handle escaped quotes in quoted fields
                fields.append(quoted.replace('""', '"'))
            else:
                fields.append(unquoted)

            pos = match.end()
            # A comma always announces another field, even an empty one at
            # the very end of the line; anything else means the line is done.
            if match.group('sep') != ',':
                break

        return fields

    def parse_csv(self, csv_content):
        """Parse multiple CSV lines.

        The first line is the header.  Blank lines are skipped, and so are
        rows whose field count differs from the header's.

        Returns:
            A list of dicts mapping header names to field values.
        """
        lines = csv_content.strip().split('\n')

        # First line is header
        headers = self.parse_line(lines[0])
        data = []

        for line in lines[1:]:
            if not line.strip():  # Skip empty lines
                continue
            values = self.parse_line(line)
            if len(values) == len(headers):
                data.append(dict(zip(headers, values)))

        return data

# Demo: parse a small CSV document with quoted commas and escaped quotes.
parser = CSVParser()

csv_data = '''Name,Age,City,Occupation
"John Doe",30,"New York, NY",Engineer
"Jane Smith",25,London,"Data Scientist"
"Bob ""Bobby"" Johnson",35,Paris,Designer'''

parsed_data = parser.parse_csv(csv_data)

print("Parsed CSV data:")
# Each row is a dict keyed by the header names.
for row in iter(parsed_data):
    print(row)

Text Processing and Automation

Code Analysis Tool

class CodeAnalyzer:
    """Collect rough, regex-based metrics about a Python source file.

    The patterns are textual heuristics, not a parser: text inside string
    literals can still produce matches.
    """

    def __init__(self):
        self.patterns = {
            # Anchored to the start of a line; the parameter list is not
            # inspected, so annotated and multi-line signatures are found too.
            'functions': re.compile(r'^[ \t]*(?:async[ \t]+)?def\s+(\w+)\s*\(', re.MULTILINE),
            'classes': re.compile(r'^[ \t]*class\s+(\w+)', re.MULTILINE),
            # [\w.]+ so that dotted ("from os.path import ...") and relative
            # ("from . import ...") modules are recognised.
            'imports': re.compile(r'^(?:from\s+[\w.]+\s+)?import\s+.*$', re.MULTILINE),
            'comments': re.compile(r'#.*$', re.MULTILINE),
            'docstrings': re.compile(r'""".*?"""', re.DOTALL),
            # (?!=) keeps "==" comparisons from counting as assignments.
            'variables': re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\s*=(?!=)', re.MULTILINE),
        }

    def analyze_file(self, file_path):
        """Analyze a Python file and extract various metrics.

        Args:
            file_path: Path of a UTF-8 encoded source file.

        Returns:
            A dict with one ``{'count': int, 'items': list}`` entry per
            pattern category (``items`` holds at most the first ten matches),
            plus ``total_lines``, ``code_lines`` (non-blank lines that do not
            start with ``#``) and ``comment_ratio`` (comments per line).

        Raises:
            OSError: If the file cannot be opened.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        analysis = {}
        for category, pattern in self.patterns.items():
            matches = pattern.findall(content)
            analysis[category] = {
                'count': len(matches),
                'items': matches[:10]  # First 10 items
            }

        # Calculate additional metrics
        lines = content.split('\n')  # never empty, so the ratio is safe
        analysis['total_lines'] = len(lines)
        analysis['code_lines'] = sum(
            1 for line in lines
            if line.strip() and not line.strip().startswith('#')
        )
        analysis['comment_ratio'] = analysis['comments']['count'] / analysis['total_lines']

        return analysis

    def generate_report(self, analysis):
        """Generate a readable analysis report.

        Args:
            analysis: A result dict as returned by ``analyze_file``.

        Returns:
            The report as a multi-line string.
        """
        report = f"""
Code Analysis Report
===================

File Metrics:
- Total lines: {analysis['total_lines']}
- Code lines: {analysis['code_lines']}
- Comments: {analysis['comments']['count']}
- Comment ratio: {analysis['comment_ratio']:.1%}

Code Structure:
- Functions: {analysis['functions']['count']}
- Classes: {analysis['classes']['count']}
- Imports: {analysis['imports']['count']}

Top Functions: {', '.join(analysis['functions']['items'][:5])}
Top Classes: {', '.join(analysis['classes']['items'][:3])}
"""
        return report

# Example usage (assuming we have a Python file)
analyzer = CodeAnalyzer()

# For demonstration, analyze this script itself
try:
    analysis = analyzer.analyze_file(__file__)
    print(analyzer.generate_report(analysis))
except (NameError, FileNotFoundError):
    # NameError: __file__ is not defined in an interactive session.
    # FileNotFoundError: the script path no longer points at a file.
    print("Could not analyze file - running in interactive mode")

URL and Link Extraction

class LinkExtractor:
    """Extract URLs, e-mail addresses and HTML links from text."""

    def __init__(self):
        # http(s) URL pattern.  It has no capturing groups, so findall()
        # returns whole URLs.  In VERBOSE mode an unescaped '#' outside a
        # character class starts a comment, hence the escaped '\#'.
        self.url_pattern = re.compile(r'''
            https?://                                      # Protocol
            [-\w]+(?:\.[-\w]+)*                            # Host labels
            (?::\d+)?                                      # Optional port
            (?:/[\w/.~%+-]*)?                              # Optional path
            (?:\?[\w&=%.+-]*)?                             # Optional query
            (?:\#[\w-]*)?                                  # Optional fragment
        ''', re.VERBOSE | re.IGNORECASE)

        # Email pattern
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

        # href="..." / href='...' attribute values
        self.href_pattern = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE)

    def extract_from_text(self, text):
        """Extract URLs and emails from text.

        Returns:
            A dict with ``urls`` and ``emails`` (duplicates removed, in order
            of first appearance) and ``url_count`` / ``email_count`` (total
            occurrences, duplicates included).
        """
        urls = self.url_pattern.findall(text)
        emails = self.email_pattern.findall(text)

        return {
            'urls': list(dict.fromkeys(urls)),      # Remove duplicates, keep order
            'emails': list(dict.fromkeys(emails)),  # Remove duplicates, keep order
            'url_count': len(urls),
            'email_count': len(emails)
        }

    def extract_from_html(self, html_content):
        """Extract links from HTML content.

        Links starting with ``http://``, ``https://`` or ``//`` are external;
        every other href value (absolute paths, relative paths, and other
        schemes such as ``mailto:``) is listed as internal.

        Returns:
            A dict with ``internal_links``, ``external_links`` and
            ``total_links``.
        """
        links = self.href_pattern.findall(html_content)

        # Categorize links
        internal = []
        external = []

        for link in links:
            if link.startswith(('http://', 'https://', '//')):
                external.append(link)
            else:
                internal.append(link)

        return {
            'internal_links': internal,
            'external_links': external,
            'total_links': len(links)
        }

# Demo: pull URLs and e-mail addresses out of free text, then classify the
# links found in an HTML fragment.
extractor = LinkExtractor()

text_content = """
Visit our website at https://www.example.com for more info.
Contact us at support@example.com or sales@company.org.
Check out https://github.com/user/repo for the source code.
"""

results = extractor.extract_from_text(text_content)
print("Text Analysis:")
print(f"URLs found: {results['url_count']}")
print(f"Emails found: {results['email_count']}")
print("URLs:", results['urls'])
print("Emails:", results['emails'])

html_content = '''
<a href="https://www.google.com">Google</a>
<a href="/about">About Us</a>
<a href="contact.html">Contact</a>
<a href="mailto:info@example.com">Email Us</a>
'''

html_results = extractor.extract_from_html(html_content)
print("\nHTML Analysis:")
print(f"Total links: {html_results['total_links']}")
print(f"External links: {len(html_results['external_links'])}")
print(f"Internal links: {len(html_results['internal_links'])}")

Configuration File Parsing

class ConfigParser:
    """Parse INI-style text into nested dicts with basic type conversion."""

    def __init__(self):
        # Patterns for different config formats
        self.patterns = {
            'key_value': re.compile(r'^\s*(\w+)\s*[:=]\s*(.+?)\s*$', re.MULTILINE),
            'section': re.compile(r'^\s*\[([^\]]+)\]\s*$', re.MULTILINE),
            'comment': re.compile(r'^\s*[;#].*$', re.MULTILINE),
            # Used with fullmatch() to decide whether a value is numeric.
            'integer': re.compile(r'\d+'),
            'float': re.compile(r'\d+\.\d*|\.\d+'),
        }

    def _convert_value(self, value):
        """Return *value* as bool, int or float when it looks like one."""
        lowered = value.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        if self.patterns['integer'].fullmatch(value):
            return int(value)
        # Exactly one decimal point is required, so dotted strings such as
        # IP addresses or version numbers stay strings.
        if self.patterns['float'].fullmatch(value):
            return float(value)
        return value

    def parse_ini_style(self, content):
        """Parse INI-style configuration files.

        Lines are ``[section]`` headers, ``key = value`` / ``key: value``
        pairs, or comments starting with ``;`` or ``#``.  Pairs that appear
        before any header go into the ``'default'`` section.  Lines matching
        none of these forms are ignored.

        Args:
            content: The configuration text.

        Returns:
            A dict mapping section names to dicts of converted values.
        """
        config = {}
        current_section = 'default'

        for raw_line in content.split('\n'):
            line = raw_line.strip()

            # Skip empty lines and comments
            if not line or self.patterns['comment'].match(line):
                continue

            # Section header
            section_match = self.patterns['section'].match(line)
            if section_match:
                current_section = section_match.group(1)
                # setdefault: a repeated header extends the section instead
                # of discarding the keys collected so far.
                config.setdefault(current_section, {})
                continue

            # Key-value pair
            kv_match = self.patterns['key_value'].match(line)
            if kv_match:
                key, value = kv_match.groups()
                config.setdefault(current_section, {})[key] = self._convert_value(value)

        return config

# Demo: parse a two-section INI document and show each value with its type.
parser = ConfigParser()

config_content = """
# Database configuration
[database]
host = localhost
port = 5432
name = myapp_db
debug = true

# API settings
[api]
base_url = https://api.example.com
timeout = 30.5
max_retries = 3

; This is a comment
user_agent = MyApp/1.0
"""

parsed_config = parser.parse_ini_style(config_content)
print("Parsed Configuration:")
for section in parsed_config:
    settings = parsed_config[section]
    print(f"\n[{section}]")
    for key, value in settings.items():
        print(f"{key} = {value} ({type(value).__name__})")

Automation Scripts

File Renamer

import os
import re

class FileRenamer:
    """Rename the files of a directory to sanitized, lowercase names."""

    def __init__(self):
        self.patterns = {
            'date_format': re.compile(r'(\d{4})[-_](\d{2})[-_](\d{2})'),
            'spaces': re.compile(r'\s+'),
            'special_chars': re.compile(r'[^\w.-]'),
        }

    def sanitize_filename(self, filename):
        """Sanitize filename for better organization.

        Whitespace runs in the base name become single underscores, every
        character other than word characters, ``.`` and ``-`` is removed, and
        the base name is lowercased.  The extension is kept unchanged.
        """
        name, ext = os.path.splitext(filename)

        # Replace spaces with underscores
        name = self.patterns['spaces'].sub('_', name)

        # Remove special characters
        name = self.patterns['special_chars'].sub('', name)

        # Convert to lowercase
        name = name.lower()

        return name + ext

    def rename_files(self, directory, preview=True):
        """Rename files in directory with sanitized names.

        A file is skipped, with a printed notice, when its sanitized name is
        already taken by a different file or by an earlier rename in the same
        run, so no existing file is ever overwritten.

        Args:
            directory: Directory whose regular files are processed.
            preview: When True, only print what would be renamed.

        Returns:
            A list of ``(old_name, new_name)`` pairs for the renames actually
            performed; always empty in preview mode.
        """
        renamed = []
        claimed = set()  # target names already used during this run

        for filename in os.listdir(directory):
            old_path = os.path.join(directory, filename)
            if not os.path.isfile(old_path):
                continue

            new_name = self.sanitize_filename(filename)
            if new_name == filename:
                continue

            new_path = os.path.join(directory, new_name)

            # samefile() lets a case-only rename through on case-insensitive
            # file systems, where new_path resolves to the file being renamed.
            occupied = (os.path.exists(new_path)
                        and not os.path.samefile(old_path, new_path))
            if occupied or new_name in claimed:
                print(f"Skipping {filename}: {new_name} already exists")
                continue
            claimed.add(new_name)

            if preview:
                print(f"Would rename: {filename} -> {new_name}")
                continue

            try:
                os.rename(old_path, new_path)
            except OSError as e:
                print(f"Error renaming {filename}: {e}")
            else:
                renamed.append((filename, new_name))

        return renamed

# Demo: dry run over the current directory; nothing on disk is touched.
renamer = FileRenamer()

# Preview changes
print("Preview of changes:")
renamer.rename_files(directory='.', preview=True)

# Uncomment to actually rename files
# renamed = renamer.rename_files('.', preview=False)
# print(f"Renamed {len(renamed)} files")

Text Template Processor

class TemplateProcessor:
    """Tiny template engine: variables, ``#if`` blocks and ``#each`` loops.

    Blocks of the same kind cannot be nested inside each other, because each
    opening tag is paired with the nearest closing tag.
    """

    def __init__(self):
        # Pattern for template variables {{variable}}
        self.var_pattern = re.compile(r'\{\{(\w+)\}\}')

        # Pattern for conditional blocks {{#if condition}}...{{/if}}
        self.if_pattern = re.compile(r'\{\{#if\s+(\w+)\}\}(.*?)\{\{/if\}\}', re.DOTALL)

        # Pattern for loops {{#each items}}...{{/each}}
        self.each_pattern = re.compile(r'\{\{#each\s+(\w+)\}\}(.*?)\{\{/each\}\}', re.DOTALL)

    def process_template(self, template, context):
        """Process template with variables and control structures.

        Processing order: conditional blocks are kept or dropped, loops are
        expanded once per item, then the remaining variables are replaced.

        Args:
            template: The template text.
            context: Mapping of variable names to values.  A loop variable
                should hold an iterable of mappings; a missing or falsy loop
                variable produces no output.

        Returns:
            The rendered text.
        """

        # Process conditional blocks
        def process_condition(match):
            # Return the body untouched.  Substituting variables here would
            # bake outer-context values into a nested {{#each}} body before
            # the loop gets to apply its per-item context.
            if context.get(match.group(1)):
                return match.group(2)
            return ''

        template = self.if_pattern.sub(process_condition, template)

        # Process loops
        def process_loop(match):
            var_name = match.group(1)
            template_content = match.group(2)

            rendered = []
            for item in context.get(var_name) or []:
                # Item keys shadow same-named keys of the outer context.
                item_context = dict(context)
                item_context.update(item)
                rendered.append(self.process_variables(template_content, item_context))

            return ''.join(rendered)

        template = self.each_pattern.sub(process_loop, template)

        # Process variables
        return self.process_variables(template, context)

    def process_variables(self, text, context):
        """Replace {{variable}} with values from context.

        Variables missing from *context* are replaced with an empty string.
        """
        def replace_var(match):
            return str(context.get(match.group(1), ''))

        return self.var_pattern.sub(replace_var, text)

# Demo: render a small invoice with a conditional header and a product loop.
processor = TemplateProcessor()

template = """
{{#if show_header}}
# {{title}}

{{/if}}
Hello {{name}}.
{{#each products}}
- {{name}}: ${{price}}
{{/each}}

Total: ${{total}}
"""

# Top-level values feed the header and total; each product dict supplies the
# per-item values inside the loop.
context = {
    'show_header': True,
    'title': 'Invoice',
    'name': 'John Doe',
    'products': [
        {'name': 'Widget A', 'price': '10.99'},
        {'name': 'Widget B', 'price': '15.50'},
    ],
    'total': '26.49'
}

result = processor.process_template(template=template, context=context)
print(result)

Best Practices for Production Use

1. Pre-compile Patterns

# Good: Compile once, use everywhere
EMAIL_PATTERN = re.compile(
    r"[a-z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)+",
    re.IGNORECASE,
)

def validate_email(email):
    """Return True when *email* as a whole looks like ``local@domain.tld``."""
    # fullmatch: the entire string must be an address, not just a prefix.
    return bool(EMAIL_PATTERN.fullmatch(email))

2. Use Raw Strings

# Always write regex patterns as raw strings so that backslashes reach the
# regex engine unchanged
pattern = r"\d{3}-\d{3}-\d{4}"  # Good
# pattern = '\d{3}-\d{3}-\d{4}'  # Bad - '\d' is not a valid string escape

3. Handle Exceptions

def safe_regex_search(pattern, text, flags=0):
    """Search *text* for *pattern* without raising on a malformed pattern.

    Returns the match object, or None when nothing matches or when the
    pattern does not compile (in which case the error is printed).
    """
    try:
        matcher = re.compile(pattern, flags)
    except re.error as exc:
        print(f"Invalid regex pattern: {exc}")
        return None
    return matcher.search(text)

4. Test Thoroughly

def test_regex_patterns():
    """Run validate_email over sample inputs and print one pass/fail line each."""
    test_cases = [
        ("valid@example.com", True),
        ("invalid@", False),
        ("test.email@domain.co.uk", True),
    ]

    for test_input, expected in test_cases:
        result = validate_email(test_input)
        if result == expected:
            status = "✓"
        else:
            status = "✗"
        print(f"{status} {test_input}: expected {expected}, got {result}")

test_regex_patterns()

Key Takeaways

Input validation: Regex excels at validating emails, passwords, phone numbers, and other formats
Data extraction: Parse logs, CSV files, configuration files, and structured text
Text processing: Clean data, extract links, analyze code, and automate repetitive tasks
Template processing: Build simple templating systems for dynamic content generation
Always test thoroughly: Regex patterns can have edge cases that break with unexpected input
Performance matters: Pre-compile patterns and avoid catastrophic backtracking
Combine with other tools: Regex works best when combined with string methods and other Python features

Congratulations! You've completed the Regular Expressions course. You now have the power to process text in ways that seemed impossible before. Regex is a skill that will serve you well in data processing, web development, system administration, and countless other domains. Keep practicing, and you'll find regex becoming an indispensable part of your programming toolkit.