- Apply regex in practical data processing scenarios
- Build validation tools for user input
- Create text parsing utilities
- Implement automation scripts with regex
Real-World Applications
Introduction
Regular expressions are not just theoretical concepts - they're powerful tools that solve real-world problems every day. From validating user input to parsing complex data formats, regex is the secret weapon in many developers' toolboxes.
In this final lesson, we'll apply everything we've learned to build practical tools and utilities. You'll see how regex transforms complex text processing tasks into simple, elegant solutions.
Input Validation
Email Validation
Email validation is one of the most common regex applications. Let's build a practical email validator based on a simplified form of the RFC 5322 address syntax:
import re
class EmailValidator:
    """Validate e-mail addresses and split them into local part and domain.

    The pattern is a simplified form of the RFC 5322 "dot-atom" address:
    quoted local parts, comments and IP-literal domains are not supported.
    """

    def __init__(self):
        # The local part is a dot-separated run of atoms, so a leading,
        # trailing or doubled "." is rejected. Each domain label is 1-63
        # characters and may not start or end with a hyphen. \Z (rather
        # than $) refuses a trailing newline after the address.
        self.pattern = re.compile(r"""
            ^                                   # Start of string
            (?P<local>                          # Local part
                [a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+
                (?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*
            )
            @                                   # @ symbol
            (?P<domain>                         # Domain part
                [a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
                (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
            )
            \Z                                  # End of string (no trailing newline)
        """, re.VERBOSE | re.IGNORECASE)

    def validate(self, email):
        """Validate email format and extract components.

        Args:
            email: The address to check. Non-string values are reported as
                invalid instead of raising.

        Returns:
            On success, a dict with ``valid`` (True), ``local_part``,
            ``domain`` and ``full``. On failure, a dict with ``valid``
            (False) and ``error``.
        """
        match = self.pattern.match(email) if isinstance(email, str) else None
        if match is None:
            return {'valid': False, 'error': 'Invalid email format'}
        return {
            'valid': True,
            'local_part': match.group('local'),
            'domain': match.group('domain'),
            'full': email,
        }
# Demo: run a mix of well-formed and malformed addresses through the validator.
validator = EmailValidator()
emails = [
    "user@example.com",
    "test.email+tag@domain.co.uk",
    "invalid@",
    "user@.com",
    "user..double@domain.com",
]
for email in emails:
    result = validator.validate(email)
    if not result['valid']:
        status = "✗ Invalid"
        print(f"{email}: {status}")
        continue
    # Valid addresses also get their extracted components printed.
    status = "✓ Valid"
    print(f"{email}: {status}")
    print(f" Local: {result['local_part']}, Domain: {result['domain']}")
Password Strength Validation
class PasswordValidator:
    """Check a password against a fixed set of regex-based strength criteria."""

    # Feedback shown for each failed criterion, in reporting order.
    _FEEDBACK = (
        ('length', "Password must be at least 8 characters long"),
        ('uppercase', "Password must contain at least one uppercase letter"),
        ('lowercase', "Password must contain at least one lowercase letter"),
        ('digit', "Password must contain at least one digit"),
        ('special', "Password must contain at least one special character"),
    )

    def __init__(self):
        self.patterns = {
            # DOTALL so a newline inside the password still counts toward
            # the eight-character minimum.
            'length': re.compile(r'.{8,}', re.DOTALL),
            'uppercase': re.compile(r'[A-Z]'),
            'lowercase': re.compile(r'[a-z]'),
            'digit': re.compile(r'\d'),
            'special': re.compile(r'[!@#$%^&*(),.?":{}|<>]'),
        }

    def validate(self, password):
        """Check password against multiple criteria.

        Returns:
            A dict mapping every criterion name to a bool, plus ``overall``,
            which is True only when every criterion passed.
        """
        results = {
            criterion: bool(pattern.search(password))
            for criterion, pattern in self.patterns.items()
        }
        results['overall'] = all(results.values())
        return results

    def get_feedback(self, password):
        """Provide detailed feedback on password strength.

        Returns:
            A dict with ``valid`` (all criteria met), ``feedback`` (one
            message per failed criterion) and ``strength_score`` (fraction
            of criteria met, between 0.0 and 1.0).
        """
        validation = self.validate(password)
        feedback = [
            message
            for criterion, message in self._FEEDBACK
            if not validation.get(criterion, True)
        ]
        # Score only the real criteria: 'overall' is derived from them and
        # would otherwise be counted as an extra criterion.
        passed = [validation[criterion] for criterion in self.patterns]
        score = sum(passed) / len(passed) if passed else 0.0
        return {
            'valid': validation['overall'],
            'feedback': feedback,
            'strength_score': score,
        }
# Demo: score a few sample passwords and list what each weak one is missing.
validator = PasswordValidator()
passwords = ["weak", "Strong123!", "Password123", "Str0ng!Pass"]
for pwd in passwords:
    result = validator.get_feedback(pwd)
    if result['valid']:
        status = "✓ Strong"
    else:
        status = "✗ Weak"
    score = f"{result['strength_score']:.1%}"
    print(f"'{pwd}': {status} ({score})")
    if result['valid']:
        continue
    # Only weak passwords have feedback worth printing.
    for issue in result['feedback']:
        print(f" - {issue}")
Data Extraction and Parsing
Log File Analysis
class LogAnalyzer:
    """Parse ``timestamp [LEVEL] component: message`` log lines and summarize errors."""

    def __init__(self):
        # One log entry per line. The pattern is anchored at both ends of
        # the line: without the trailing "$" the lazy message group matches
        # the empty string and the optional details group never takes part.
        # Separators use [ \t] instead of \s so a match cannot spill over
        # onto the next line.
        self.log_pattern = re.compile(r"""
            ^[ \t]*
            (?P<timestamp>\d{4}-\d{2}-\d{2}[ \t]\d{2}:\d{2}:\d{2})   # Timestamp
            [ \t]+\[(?P<level>\w+)\][ \t]+                           # Log level
            (?P<component>[\w.]+):[ \t]+                             # Component
            (?P<message>.*?)                                         # Message
            (?:[ \t]+Details:[ \t]+(?P<details>.*?))?                # Optional details
            [ \t\r]*$                                                # Trailing blanks / CR
        """, re.VERBOSE | re.MULTILINE)
        self.error_pattern = re.compile(r'ERROR|CRITICAL|FATAL', re.IGNORECASE)

    def parse_logs(self, log_content):
        """Parse log content and extract structured data.

        Args:
            log_content: Text holding zero or more log lines. Lines that do
                not follow the expected layout are skipped.

        Returns:
            A list of dicts with keys ``timestamp``, ``level``,
            ``component``, ``message``, ``details`` (None when absent) and
            ``is_error``.
        """
        logs = []
        for match in self.log_pattern.finditer(log_content):
            log_entry = match.groupdict()
            # Severity lives in the bracketed level, not in the message text.
            log_entry['is_error'] = bool(
                self.error_pattern.fullmatch(log_entry['level'])
            )
            logs.append(log_entry)
        return logs

    def get_error_summary(self, logs):
        """Generate error summary from parsed logs.

        Args:
            logs: Entries as returned by ``parse_logs``.

        Returns:
            A dict with ``total_errors``, ``errors_by_component`` (component
            name -> count) and ``recent_errors`` (the last five error entries).
        """
        errors = [log for log in logs if log['is_error']]
        components = {}
        for error in errors:
            comp = error['component']
            components[comp] = components.get(comp, 0) + 1
        return {
            'total_errors': len(errors),
            'errors_by_component': components,
            'recent_errors': errors[-5:],  # Last 5 errors
        }
# Demo: parse a small in-memory log and report error counts per component.
analyzer = LogAnalyzer()
log_content = """
2023-12-01 10:30:45 [INFO] web.server: User login successful
2023-12-01 10:31:22 [ERROR] db.connection: Connection timeout after 30 seconds
2023-12-01 10:32:15 [WARN] auth.service: Invalid password attempt
2023-12-01 10:33:01 [ERROR] api.endpoint: Rate limit exceeded for IP 192.168.1.1
2023-12-01 10:34:20 [INFO] cache.service: Cache cleared successfully
"""
logs = analyzer.parse_logs(log_content)
summary = analyzer.get_error_summary(logs)
print(f"Total logs parsed: {len(logs)}")
print(f"Errors found: {summary['total_errors']}")
print("Errors by component:")
errors_by_component = summary['errors_by_component']
for comp in errors_by_component:
    count = errors_by_component[comp]
    print(f" {comp}: {count}")
CSV Data Parsing
class CSVParser:
    """Minimal regex-based CSV reader with support for double-quoted fields.

    Each record must fit on one line; quoted fields that contain line
    breaks are not supported.
    """

    def __init__(self):
        # Pattern to handle quoted fields with commas
        self.field_pattern = re.compile(r'''
            \s*                 # Leading whitespace
            (?:
                "((?:[^"]|"")*)"    # Quoted field
                |
                ([^,]*?)            # Unquoted field
            )
            \s*                 # Trailing whitespace
            (?:,|$)             # Comma or end of line
        ''', re.VERBOSE)

    def parse_line(self, line):
        """Parse a single CSV line handling quoted fields.

        Args:
            line: One CSV record without its line terminator.

        Returns:
            The list of field values. Quoted fields have their surrounding
            quotes removed and doubled quotes collapsed. An empty line
            yields an empty list.
        """
        fields = []
        pos = 0
        end = len(line)
        while pos < end:
            match = self.field_pattern.match(line, pos)
            if not match:
                break
            quoted, bare = match.group(1), match.group(2)
            if quoted is not None:
                # Handle escaped quotes in quoted fields
                fields.append(quoted.replace('""', '"'))
            else:
                fields.append(bare)
            pos = match.end()
            # A comma that ends the line is followed by one more, empty,
            # field; the loop would otherwise stop without recording it.
            if pos == end and match.group(0).endswith(','):
                fields.append('')
        return fields

    def parse_csv(self, csv_content):
        """Parse multiple CSV lines.

        The first non-blank line is taken as the header. Rows whose field
        count differs from the header are skipped.

        Args:
            csv_content: CSV text using "\\n" or "\\r\\n" line endings.

        Returns:
            A list of dicts mapping header names to field values.
        """
        lines = csv_content.strip().splitlines()
        if not lines:
            return []
        # First line is header
        headers = self.parse_line(lines[0])
        data = []
        for line in lines[1:]:
            if not line.strip():  # Skip empty lines
                continue
            values = self.parse_line(line)
            if len(values) == len(headers):
                data.append(dict(zip(headers, values)))
        return data
# Demo: parse a CSV sample with quoted commas and doubled quotes.
parser = CSVParser()
csv_data = '''Name,Age,City,Occupation
"John Doe",30,"New York, NY",Engineer
"Jane Smith",25,London,"Data Scientist"
"Bob ""Bobby"" Johnson",35,Paris,Designer'''
parsed_data = parser.parse_csv(csv_data)
print("Parsed CSV data:")
row_iter = iter(parsed_data)
for row in row_iter:
    print(row)
Text Processing and Automation
Code Analysis Tool
class CodeAnalyzer:
    """Collect rough, regex-based metrics about a Python source file.

    These are line-oriented heuristics, not a parser: a "#" or "def" that
    appears inside a string literal is still counted.
    """

    def __init__(self):
        self.patterns = {
            # Match on the "def name(" prefix only, so return annotations
            # and nested parentheses in defaults do not defeat the match.
            'functions': re.compile(
                r'^[ \t]*(?:async[ \t]+)?def\s+(\w+)\s*\(', re.MULTILINE),
            'classes': re.compile(r'^[ \t]*class\s+(\w+)', re.MULTILINE),
            # [\w.]+ also covers dotted and relative modules (from a.b import c).
            'imports': re.compile(
                r'^[ \t]*(?:from\s+[\w.]+\s+)?import\s+.*$', re.MULTILINE),
            'comments': re.compile(r'#.*$', re.MULTILINE),
            # Both triple-quote styles.
            'docstrings': re.compile(
                r'""".*?"""' + '|' + r"'''.*?'''", re.DOTALL),
            # (?!=) keeps "==" comparisons from being counted as assignments.
            'variables': re.compile(
                r'\b[a-zA-Z_][a-zA-Z0-9_]*\s*=(?!=)', re.MULTILINE),
        }

    def analyze_file(self, file_path):
        """Analyze a Python file and extract various metrics.

        Args:
            file_path: Path of the UTF-8 encoded source file to read.

        Returns:
            A dict with one ``{'count', 'items'}`` entry per pattern
            category (``items`` holds at most the first ten matches), plus
            ``total_lines``, ``code_lines`` and ``comment_ratio``.

        Raises:
            OSError: If the file cannot be opened.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        analysis = {}
        for category, pattern in self.patterns.items():
            matches = pattern.findall(content)
            analysis[category] = {
                'count': len(matches),
                'items': matches[:10],  # First 10 items
            }
        # splitlines() does not invent an extra empty line after the final
        # newline, so a six-line file reports six lines.
        lines = content.splitlines()
        total_lines = len(lines)
        analysis['total_lines'] = total_lines
        analysis['code_lines'] = sum(
            1 for line in lines
            if line.strip() and not line.strip().startswith('#')
        )
        # An empty file has no lines; avoid dividing by zero.
        analysis['comment_ratio'] = (
            analysis['comments']['count'] / total_lines if total_lines else 0.0
        )
        return analysis

    def generate_report(self, analysis):
        """Generate a readable analysis report.

        Args:
            analysis: The dict returned by ``analyze_file``.

        Returns:
            A multi-line, human-readable summary string.
        """
        report = f"""
Code Analysis Report
===================
File Metrics:
- Total lines: {analysis['total_lines']}
- Code lines: {analysis['code_lines']}
- Comments: {analysis['comments']['count']}
- Comment ratio: {analysis['comment_ratio']:.1%}
Code Structure:
- Functions: {analysis['functions']['count']}
- Classes: {analysis['classes']['count']}
- Imports: {analysis['imports']['count']}
Top Functions: {', '.join(analysis['functions']['items'][:5])}
Top Classes: {', '.join(analysis['classes']['items'][:3])}
"""
        return report
# Example usage (assuming we have a Python file)
analyzer = CodeAnalyzer()
# For demonstration, analyze this script itself
try:
    analysis = analyzer.analyze_file(__file__)
    print(analyzer.generate_report(analysis))
except (NameError, FileNotFoundError):
    # In an interactive session __file__ is not defined at all, which
    # surfaces as NameError rather than FileNotFoundError.
    print("Could not analyze file - running in interactive mode")
URL and Link Extraction
class LinkExtractor:
    """Pull URLs and e-mail addresses out of plain text, and hrefs out of HTML."""

    def __init__(self):
        # http(s) URL pattern. Under re.VERBOSE an unescaped "#" outside a
        # character class starts a comment, so the fragment marker must be
        # written as "\#". Host and path may not end in "." so the full
        # stop that closes a sentence is not swallowed into the URL.
        self.url_pattern = re.compile(r'''
            https?://                           # Protocol
            (?:[\w-]+\.)*[\w-]+                 # Host labels
            (?::\d+)?                           # Optional port
            (?:/(?:[\w/.%~+-]*[\w/%~+-])?)?     # Optional path
            (?:\?[\w&=%.+-]*)?                  # Optional query string
            (?:\#[\w-]*)?                       # Optional fragment
        ''', re.VERBOSE | re.IGNORECASE)
        # Email pattern
        self.email_pattern = re.compile(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        # href attribute values, single- or double-quoted
        self.href_pattern = re.compile(
            r'href=["\']([^"\']+)["\']', re.IGNORECASE)
        # A leading URI scheme such as "mailto:" or "tel:"
        self.scheme_pattern = re.compile(r'[a-zA-Z][a-zA-Z0-9+.-]*:')

    def extract_from_text(self, text):
        """Extract URLs and emails from text.

        Returns:
            A dict with ``urls`` and ``emails`` (duplicates removed, first
            occurrence order kept) and ``url_count`` / ``email_count``
            (number of matches including duplicates).
        """
        urls = self.url_pattern.findall(text)
        emails = self.email_pattern.findall(text)
        return {
            # dict.fromkeys de-duplicates while keeping a stable order.
            'urls': list(dict.fromkeys(urls)),
            'emails': list(dict.fromkeys(emails)),
            'url_count': len(urls),
            'email_count': len(emails),
        }

    def extract_from_html(self, html_content):
        """Extract links from HTML content.

        Returns:
            A dict with ``internal_links`` (root-relative and relative
            paths), ``external_links`` (http, https and protocol-relative
            URLs), ``other_links`` (any other scheme, e.g. ``mailto:``) and
            ``total_links`` (count of all href values found).
        """
        links = self.href_pattern.findall(html_content)
        internal = []
        external = []
        other = []
        for link in links:
            if link.lower().startswith(('http://', 'https://', '//')):
                external.append(link)
            elif self.scheme_pattern.match(link):
                # mailto:, tel:, javascript: ... are not site pages.
                other.append(link)
            else:
                internal.append(link)
        return {
            'internal_links': internal,
            'external_links': external,
            'other_links': other,
            'total_links': len(links),
        }
# Demo: extract links from a plain-text snippet, then from an HTML fragment.
extractor = LinkExtractor()
text_content = """
Visit our website at https://www.example.com for more info.
Contact us at support@example.com or sales@company.org.
Check out https://github.com/user/repo for the source code.
"""
results = extractor.extract_from_text(text_content)
print("Text Analysis:")
print(f"URLs found: {results['url_count']}")
print(f"Emails found: {results['email_count']}")
for label, key in (("URLs:", 'urls'), ("Emails:", 'emails')):
    print(label, results[key])
html_content = '''
<a href="https://www.google.com">Google</a>
<a href="/about">About Us</a>
<a href="contact.html">Contact</a>
<a href="mailto:info@example.com">Email Us</a>
'''
html_results = extractor.extract_from_html(html_content)
external_total = len(html_results['external_links'])
internal_total = len(html_results['internal_links'])
print("\nHTML Analysis:")
print(f"Total links: {html_results['total_links']}")
print(f"External links: {external_total}")
print(f"Internal links: {internal_total}")
Configuration File Parsing
class ConfigParser:
    """Regex-based reader for simple INI-style configuration text."""

    def __init__(self):
        # Patterns for different config formats
        self.patterns = {
            'key_value': re.compile(r'^\s*(\w+)\s*[:=]\s*(.+?)\s*$', re.MULTILINE),
            'section': re.compile(r'^\s*\[([^\]]+)\]\s*$', re.MULTILINE),
            'comment': re.compile(r'^\s*[;#].*$', re.MULTILINE),
        }
        # Strict numeric shapes, so values such as "1.2.3" stay strings
        # instead of being handed to float().
        self._int_pattern = re.compile(r'[+-]?[0-9]+')
        self._float_pattern = re.compile(r'[+-]?(?:[0-9]+\.[0-9]*|\.[0-9]+)')

    def _coerce_value(self, value):
        """Convert a raw value string to bool, int or float when it looks like one."""
        lowered = value.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        if self._int_pattern.fullmatch(value):
            return int(value)
        if self._float_pattern.fullmatch(value):
            return float(value)
        return value

    def parse_ini_style(self, content):
        """Parse INI-style configuration files.

        Lines starting with ";" or "#" are comments. Keys that appear
        before any ``[section]`` header are stored under ``'default'``. A
        section header that appears more than once adds to the existing
        section.

        Args:
            content: The configuration text.

        Returns:
            A dict mapping section names to dicts of ``key -> value``, where
            values are converted to bool, int or float when they match.
        """
        config = {}
        current_section = 'default'
        for raw_line in content.splitlines():
            line = raw_line.strip()
            # Skip empty lines and comments
            if not line or self.patterns['comment'].match(line):
                continue
            # Section header
            section_match = self.patterns['section'].match(line)
            if section_match:
                current_section = section_match.group(1)
                # setdefault keeps keys from an earlier block of the same section.
                config.setdefault(current_section, {})
                continue
            # Key-value pair
            kv_match = self.patterns['key_value'].match(line)
            if kv_match:
                key, value = kv_match.groups()
                config.setdefault(current_section, {})[key] = self._coerce_value(value)
        return config
# Demo: parse a two-section configuration and show each value with its type.
parser = ConfigParser()
config_content = """
# Database configuration
[database]
host = localhost
port = 5432
name = myapp_db
debug = true
# API settings
[api]
base_url = https://api.example.com
timeout = 30.5
max_retries = 3
; This is a comment
user_agent = MyApp/1.0
"""
parsed_config = parser.parse_ini_style(config_content)
print("Parsed Configuration:")
for section in parsed_config:
    settings = parsed_config[section]
    print(f"\n[{section}]")
    for key in settings:
        value = settings[key]
        print(f"{key} = {value} ({type(value).__name__})")
Automation Scripts
File Renamer
import os
import re
class FileRenamer:
    """Normalize file names: underscores for whitespace, no special characters, lowercase."""

    def __init__(self):
        self.patterns = {
            'date_format': re.compile(r'(\d{4})[-_](\d{2})[-_](\d{2})'),
            'spaces': re.compile(r'\s+'),
            'special_chars': re.compile(r'[^\w.-]'),
        }

    def sanitize_filename(self, filename):
        """Sanitize filename for better organization.

        The extension is kept as-is; only the stem is rewritten. If nothing
        would be left of the stem, the original name is returned unchanged
        so the file does not end up as a bare extension.

        Args:
            filename: A file name without any directory part.

        Returns:
            The sanitized file name.
        """
        name, ext = os.path.splitext(filename)
        # Replace spaces with underscores
        name = self.patterns['spaces'].sub('_', name)
        # Remove special characters
        name = self.patterns['special_chars'].sub('', name)
        # Convert to lowercase
        name = name.lower()
        if not name:
            return filename
        return name + ext

    def rename_files(self, directory, preview=True):
        """Rename files in directory with sanitized names.

        A file is skipped when its sanitized name is already taken by a
        different file, because ``os.rename`` could silently overwrite it.

        Args:
            directory: Directory whose regular files are processed
                (sub-directories are not entered).
            preview: When True, only print what would happen.

        Returns:
            A list of ``(old_name, new_name)`` tuples for files that were
            actually renamed; always empty in preview mode.
        """
        renamed = []
        claimed = set()  # Target names already handed out during this run
        for filename in sorted(os.listdir(directory)):
            old_path = os.path.join(directory, filename)
            if not os.path.isfile(old_path):
                continue
            new_name = self.sanitize_filename(filename)
            if new_name == filename:
                continue
            new_path = os.path.join(directory, new_name)
            # On case-insensitive file systems the target may "exist" simply
            # because it is the same file under different casing.
            occupied = os.path.exists(new_path) and not os.path.samefile(
                old_path, new_path)
            if occupied or new_name in claimed:
                print(f"Skipping {filename}: {new_name} already exists")
                continue
            claimed.add(new_name)
            if preview:
                print(f"Would rename: {filename} -> {new_name}")
                continue
            try:
                os.rename(old_path, new_path)
            except OSError as e:
                print(f"Error renaming {filename}: {e}")
            else:
                renamed.append((filename, new_name))
        return renamed
# Demo: dry run only - list the renames that would happen in the current directory.
renamer = FileRenamer()
print("Preview of changes:")
renamer.rename_files(directory='.', preview=True)
# Uncomment to actually rename files
# renamed = renamer.rename_files('.', preview=False)
# print(f"Renamed {len(renamed)} files")
Text Template Processor
class TemplateProcessor:
    """Tiny template engine: ``{{var}}``, ``{{#if var}}...{{/if}}``, ``{{#each var}}...{{/each}}``.

    Blocks are located with non-greedy regexes, so nesting a block inside
    another block of the same kind (if-in-if, each-in-each) is not supported.
    """

    def __init__(self):
        # Pattern for template variables {{variable}}
        self.var_pattern = re.compile(r'\{\{(\w+)\}\}')
        # Pattern for conditional blocks {{#if condition}}...{{/if}}
        self.if_pattern = re.compile(r'\{\{#if\s+(\w+)\}\}(.*?)\{\{/if\}\}', re.DOTALL)
        # Pattern for loops {{#each items}}...{{/each}}
        self.each_pattern = re.compile(r'\{\{#each\s+(\w+)\}\}(.*?)\{\{/each\}\}', re.DOTALL)

    def process_template(self, template, context):
        """Process template with variables and control structures.

        Loops are expanded first, rendering the loop body once per item
        with that item's values layered over ``context``. This lets a
        condition or variable inside a loop see the item, and stops an
        enclosing ``{{#if}}`` from substituting loop variables early.

        Args:
            template: The template text.
            context: Mapping of variable names to values. Loop variables
                should hold a list; dict items contribute their keys, any
                other item is available as ``{{this}}``.

        Returns:
            The rendered text. Unknown variables render as empty strings.
        """
        def expand_loop(match):
            body = match.group(2)
            pieces = []
            for item in context.get(match.group(1)) or []:
                # Create context for this item
                item_context = dict(context)
                if isinstance(item, dict):
                    item_context.update(item)
                else:
                    item_context['this'] = item
                pieces.append(self.process_template(body, item_context))
            return ''.join(pieces)

        def expand_condition(match):
            # Keep the body untouched; its variables are filled in by the
            # final substitution pass below.
            return match.group(2) if context.get(match.group(1)) else ''

        template = self.each_pattern.sub(expand_loop, template)
        template = self.if_pattern.sub(expand_condition, template)
        return self.process_variables(template, context)

    def process_variables(self, text, context):
        """Replace {{variable}} with values from context.

        Args:
            text: Text containing ``{{name}}`` placeholders.
            context: Mapping of names to values; values go through ``str()``.

        Returns:
            The text with every placeholder replaced; names missing from
            ``context`` become empty strings.
        """
        def replace_var(match):
            return str(context.get(match.group(1), ''))

        return self.var_pattern.sub(replace_var, text)
# Demo: render an invoice-style template with a conditional header and a product loop.
processor = TemplateProcessor()
template = """
{{#if show_header}}
# {{title}}
{{/if}}
Hello {{name}}.
{{#each products}}
- {{name}}: ${{price}}
{{/each}}
Total: ${{total}}
"""
context = dict(
    show_header=True,
    title='Invoice',
    name='John Doe',
    products=[
        dict(name='Widget A', price='10.99'),
        dict(name='Widget B', price='15.50'),
    ],
    total='26.49',
)
result = processor.process_template(template, context)
print(result)
Best Practices for Production Use
1. Pre-compile Patterns
# Good: Compile once, use everywhere
# Simplified address shape: dot-separated atoms, "@", then a host name with
# at least one dot. (A bare '...' pattern would match any three characters.)
EMAIL_PATTERN = re.compile(
    r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
    r"@[a-z0-9-]+(?:\.[a-z0-9-]+)+",
    re.IGNORECASE,
)


def validate_email(email):
    """Return True when ``email`` as a whole matches ``EMAIL_PATTERN``."""
    # fullmatch, not match: the entire string must be an address, not just
    # a prefix of it.
    return bool(EMAIL_PATTERN.fullmatch(email))
2. Use Raw Strings
# Always use raw strings for regex patterns so that backslashes reach the
# regex engine unchanged instead of being read as Python string escapes
pattern = r'\d{3}-\d{3}-\d{4}' # Good
# pattern = '\d{3}-\d{3}-\d{4}' # Bad - Python processes string escapes first (e.g. '\b' becomes a backspace)
3. Handle Exceptions
def safe_regex_search(pattern, text, flags=0):
    """Search ``text`` for ``pattern``, reporting an invalid pattern instead of raising.

    Args:
        pattern: Regular expression source to compile.
        text: String to search.
        flags: Optional ``re`` flags passed to ``re.compile``.

    Returns:
        The match object for the first match, or None when nothing matches
        or the pattern fails to compile (a message is printed in that case).
    """
    try:
        return re.compile(pattern, flags).search(text)
    except re.error as exc:
        print(f"Invalid regex pattern: {exc}")
    return None
4. Test Thoroughly
def test_regex_patterns():
    """Run validate_email over sample addresses and print a pass/fail line for each."""
    test_cases = (
        ("valid@example.com", True),
        ("invalid@", False),
        ("test.email@domain.co.uk", True),
    )
    for test_input, expected in test_cases:
        result = validate_email(test_input)
        if result == expected:
            status = "✓"
        else:
            status = "✗"
        print(f"{status} {test_input}: expected {expected}, got {result}")


test_regex_patterns()
Key Takeaways
- Input validation: Regex excels at validating emails, passwords, phone numbers, and other formats
- Data extraction: Parse logs, CSV files, configuration files, and structured text
- Text processing: Clean data, extract links, analyze code, and automate repetitive tasks
- Template processing: Build simple templating systems for dynamic content generation
- Always test thoroughly: Regex patterns can have edge cases that break with unexpected input
- Performance matters: Pre-compile patterns and avoid catastrophic backtracking
- Combine with other tools: Regex works best when combined with string methods and other Python features
Congratulations! You've completed the Regular Expressions course. You now have the power to process text in ways that seemed impossible before. Regex is a skill that will serve you well in data processing, web development, system administration, and countless other domains. Keep practicing, and you'll find regex becoming an indispensable part of your programming toolkit.
