- Process and clean real-world text data
- Extract information from strings
- Handle common text processing tasks
- Build practical text manipulation skills
Working with Text
You've learned the building blocks – indexing, slicing, methods, and formatting. Now it's time to put them together! Real-world text processing involves combining multiple techniques to solve practical problems: cleaning messy data, extracting useful information, and transforming content.
Think of this lesson as your workshop where you'll practice building complete solutions. By the end, you'll handle text like a pro!
Cleaning Text Data
Real-world text is messy. Here's how to tame it:
The Complete Cleaning Pipeline
def clean_text(text):
    """Clean and normalize text.

    Trims the ends, lowercases everything, and collapses every run of
    whitespace (spaces, tabs, newlines) into a single space.

    Args:
        text: The raw string to clean.

    Returns:
        The normalized string ("" when the input holds only whitespace).
    """
    # Step 1: Remove leading/trailing whitespace
    text = text.strip()
    # Step 2: Normalize case
    text = text.lower()
    # Step 3: Split on any whitespace. split() with no arguments treats
    # spaces, tabs and newlines alike, so no separate replace("\n", " ")
    # or replace("\t", " ") pass is needed afterwards.
    words = text.split()
    # Step 4: Re-join with exactly one space between words
    return " ".join(words)


# Test it
messy = " Hello \n World \t Python "
clean = clean_text(messy)
print(f"'{clean}'")  # 'hello world python'
Visual: The Cleaning Pipeline
Text Cleaning Pipeline
INPUT: " Hello \n World \t "
Step 1: strip() → "Hello \n World"
Step 2: lower() → "hello \n world"
Step 3: split() → ["hello", "world"]
Step 4: " ".join() → "hello world"
OUTPUT: "hello world"
Extracting Information
Pull specific data from text:
Email Parsing
def parse_email(email):
    """Extract username and domain from an email address.

    Surrounding whitespace is ignored and the address is lowercased
    before it is split.

    Args:
        email: The address to parse.

    Returns:
        A dict with the keys "full", "username", "domain" and "extension",
        or None when the input is not a string, does not contain exactly
        one "@", or has an empty username or domain. "extension" is the
        text after the last "." of the domain, or "" when the domain has
        no ".".
    """
    if not isinstance(email, str):
        return None
    email = email.strip().lower()
    # count() alone covers both "no @ at all" and "more than one @".
    if email.count("@") != 1:
        return None
    username, _, domain = email.partition("@")
    if not username or not domain:
        return None
    # rpartition reports whether a "." was found, so a dotless domain
    # such as "localhost" is not mistaken for its own extension.
    _, dot, extension = domain.rpartition(".")
    return {
        "full": email,
        "username": username,
        "domain": domain,
        "extension": extension if dot else "",
    }


# Test
info = parse_email("Alice.Smith@Example.COM")
print(info)
# {'full': 'alice.smith@example.com', 'username': 'alice.smith',
# 'domain': 'example.com', 'extension': 'com'}
Name Parsing
def parse_full_name(name):
    """Parse a full name into first, middle and last parts.

    The name is stripped and title-cased, then split on whitespace. The
    first word is the first name, the last word (when there are at least
    two) is the last name, and everything in between is the middle name.

    Args:
        name: The full name as a single string.

    Returns:
        A dict with the keys "first", "middle" and "last". Parts that are
        not present are empty strings; an empty or whitespace-only name
        yields three empty strings.
    """
    parts = name.strip().title().split()
    # Guard against blank input: indexing parts[0] would raise IndexError.
    if not parts:
        return {"first": "", "middle": "", "last": ""}
    return {
        "first": parts[0],
        # parts[1:-1] is empty for one- and two-word names.
        "middle": " ".join(parts[1:-1]),
        "last": parts[-1] if len(parts) > 1 else "",
    }


print(parse_full_name("john doe"))
# {'first': 'John', 'middle': '', 'last': 'Doe'}
print(parse_full_name("mary jane watson"))
# {'first': 'Mary', 'middle': 'Jane', 'last': 'Watson'}
URL Parsing
def parse_url(url):
    """Break a URL into protocol, domain and path.

    Args:
        url: The URL string; surrounding whitespace is ignored.

    Returns:
        A dict with the keys "protocol", "domain" and "path". The
        protocol is "http" when the URL has no "://", and the path is
        "/" when nothing follows the domain.
    """
    trimmed = url.strip()

    # Everything before the first "://" is the protocol.
    scheme, marker, remainder = trimmed.partition("://")
    if not marker:
        scheme, remainder = "http", trimmed

    # The first "/" separates the domain from the path; when there is
    # none, tail is "" and the path collapses to "/".
    host, _, tail = remainder.partition("/")

    return {"protocol": scheme, "domain": host, "path": "/" + tail}


print(parse_url("https://www.example.com/page/about"))
# {'protocol': 'https', 'domain': 'www.example.com', 'path': '/page/about'}
Validating Input
Check if text meets requirements:
Username Validation
def is_valid_username(username):
    """Check a username against the naming rules.

    Rules, checked in this order:
    - length between 3 and 20 characters
    - first character passes str.isalpha()
    - every character passes str.isalnum() or is an underscore

    Args:
        username: The candidate username.

    Returns:
        A (bool, str) tuple: whether the name is valid, and a message
        naming the first rule that failed (or "Valid username").
    """
    length = len(username)
    if length < 3 or length > 20:
        return False, "Must be 3-20 characters"

    if not username[0].isalpha():
        return False, "Must start with a letter"

    # Find the first character that is neither alphanumeric nor "_".
    offender = next(
        (ch for ch in username if ch != "_" and not ch.isalnum()),
        None,
    )
    if offender is not None:
        return False, f"Invalid character: {offender}"

    return True, "Valid username"


# Test
print(is_valid_username("alice_123"))  # (True, 'Valid username')
print(is_valid_username("123alice"))  # (False, 'Must start with a letter')
print(is_valid_username("al"))  # (False, 'Must be 3-20 characters')
print(is_valid_username("alice@home"))  # (False, 'Invalid character: @')
Password Strength Checker
def check_password_strength(password):
    """Check password strength and provide feedback.

    Five criteria are checked: at least 8 characters, an uppercase
    letter, a lowercase letter, a digit, and one of the special
    characters !@#$%^&*()_+-=[]{}|;:,.<>?  Each criterion met adds one
    point to the score.

    Args:
        password: The password to evaluate.

    Returns:
        A dict with:
        - "score": number of criteria met (0-5)
        - "strength": "Very Weak", "Weak", "Fair", "Good" or "Strong"
          (a score of 4 or 5 is reported as "Strong")
        - "issues": a hint for every criterion that was not met, in the
          order the criteria are listed above
    """
    special = "!@#$%^&*()_+-=[]{}|;:,.<>?"

    # (criterion met?, hint shown when it is not met)
    checks = [
        (len(password) >= 8, "At least 8 characters"),
        (any(c.isupper() for c in password), "Add uppercase letter"),
        (any(c.islower() for c in password), "Add lowercase letter"),
        (any(c.isdigit() for c in password), "Add a number"),
        (any(c in special for c in password), "Add special character"),
    ]
    score = sum(1 for passed, _ in checks if passed)
    issues = [hint for passed, hint in checks if not passed]

    # Determine strength. There are six possible scores (0-5) but five
    # labels, so clamp the index: indexing with a perfect score of 5
    # would otherwise raise IndexError.
    labels = ["Very Weak", "Weak", "Fair", "Good", "Strong"]
    strength = labels[min(score, len(labels) - 1)]

    return {
        "score": score,
        "strength": strength,
        "issues": issues,
    }


print(check_password_strength("password"))
# {'score': 2, 'strength': 'Fair', 'issues': ['Add uppercase letter', 'Add a number', 'Add special character']}
print(check_password_strength("MyP@ss123"))
# {'score': 5, 'strength': 'Strong', 'issues': []}
Text Transformation
Convert text between formats:
Slug Generator (URL-Friendly Text)
def create_slug(title):
    """Convert a title to a URL-friendly slug.

    The title is lowercased, every run of whitespace (spaces, tabs,
    newlines) becomes a hyphen, characters other than letters, digits
    and hyphens are dropped, and repeated or leading/trailing hyphens
    are removed.

    Args:
        title: The human-readable title.

    Returns:
        The slug; "" when the title contains no letters or digits.
    """
    # Lowercase, then turn every whitespace run into a single hyphen.
    # Splitting on any whitespace (not just " ") keeps words separated
    # by tabs or newlines from being fused together.
    slug = "-".join(title.lower().split())
    # Keep only alphanumeric characters and hyphens
    slug = "".join(c for c in slug if c.isalnum() or c == "-")
    # Dropping the empty pieces between hyphens both collapses "--"
    # runs and trims leading/trailing hyphens.
    return "-".join(piece for piece in slug.split("-") if piece)


print(create_slug("Hello World!"))  # hello-world
print(create_slug("10 Tips for Python"))  # 10-tips-for-python
print(create_slug("What's New in 2024?"))  # whats-new-in-2024
Title Case Smart Conversion
def smart_title(text):
    """Title-case text while leaving small words in lowercase.

    The text is lowercased and split on whitespace. Every word is
    capitalized except the listed small words (articles, conjunctions,
    short prepositions); the first word is always capitalized.

    Args:
        text: The text to convert.

    Returns:
        The converted words joined by single spaces.
    """
    minor_words = {"a", "an", "the", "and", "but", "or", "for",
                   "nor", "on", "at", "to", "by", "in", "of"}
    tokens = text.lower().split()
    return " ".join(
        # Position 0 is falsy, so the first word is never left lowercase.
        token if position and token in minor_words else token.capitalize()
        for position, token in enumerate(tokens)
    )


print(smart_title("THE LORD OF THE RINGS"))
# "The Lord of the Rings"
print(smart_title("a tale of two cities"))
# "A Tale of Two Cities"
Text Analysis
Analyze and understand text:
Word Frequency Counter
def word_frequency(text):
    """Count word frequency in text.

    The text is lowercased and split on whitespace, and every
    non-alphanumeric character is removed from each word. Words that
    end up empty are ignored.

    Args:
        text: The text to analyze.

    Returns:
        A dict mapping each word to its count, ordered from most to
        least frequent; words with equal counts keep the order in which
        they first appear in the text.
    """
    from collections import Counter

    # Clean and split: strip punctuation from each whitespace-separated word
    stripped = (
        "".join(c for c in word if c.isalnum())
        for word in text.lower().split()
    )
    # Count frequencies, skipping words that were punctuation only
    counts = Counter(word for word in stripped if word)
    # most_common() sorts by count, highest first
    return dict(counts.most_common())


text = "To be or not to be, that is the question. To be is to exist."
print(word_frequency(text))
# {'to': 4, 'be': 3, 'is': 2, 'or': 1, 'not': 1, ...}
Text Statistics
def text_stats(text):
    """Compute basic statistics about a piece of text.

    Args:
        text: The text to measure.

    Returns:
        A dict with:
        - "characters": total length of the text
        - "characters_no_spaces": length with " " characters removed
        - "words": number of whitespace-separated words
        - "lines": number of pieces when splitting on "\\n"
        - "sentences": count of ".", "!" and "?" (reported as 1 when
          there are none)
        - "avg_word_length": mean word length, or 0 when there are no
          words
        - "avg_words_per_sentence": words divided by the terminator
          count, or the word count itself when there are no terminators
    """
    word_list = text.split()
    word_count = len(word_list)
    terminators = sum(text.count(mark) for mark in ".!?")

    if word_list:
        mean_word_length = sum(map(len, word_list)) / word_count
    else:
        mean_word_length = 0

    if terminators:
        words_per_sentence = word_count / terminators
    else:
        words_per_sentence = word_count

    return {
        "characters": len(text),
        "characters_no_spaces": len(text.replace(" ", "")),
        "words": word_count,
        "lines": len(text.split("\n")),
        "sentences": terminators or 1,
        "avg_word_length": mean_word_length,
        "avg_words_per_sentence": words_per_sentence,
    }


sample = """Python is a programming language.
It is easy to learn.
Many developers love Python!"""
stats = text_stats(sample)
for key, value in stats.items():
    print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")
Real-World Project: Contact Card Generator
Let's combine everything into a practical project:
def create_contact_card(data):
    """Create a formatted contact card from raw data.

    The card is a 40-column box drawn with "+", "-" and "|": a header
    holding the centered name (and title, if any), followed by one
    left-aligned row each for the email and phone when they are present.

    Args:
        data: dict with name, email, phone, title (all optional).
            Missing, None or blank values are treated as absent; a
            missing or blank name is shown as "Unknown".

    Returns:
        The card as a single string with lines separated by "\\n".
    """

    def _field(key):
        """Return data[key] as a stripped string, or "" when absent."""
        return str(data.get(key) or "").strip()

    # Clean and validate data
    name = _field("name").title() or "Unknown"
    email = _field("email").lower()
    phone = _field("phone")
    title = _field("title").title()

    # Format phone (remove non-digits, then format). Numbers that do not
    # have exactly 10 digits are shown as entered.
    phone_digits = "".join(c for c in phone if c.isdigit())
    if len(phone_digits) == 10:
        phone = f"({phone_digits[:3]}) {phone_digits[3:6]}-{phone_digits[6:]}"

    # Build card
    width = 40
    inner = width - 2  # columns between the two side borders
    border = "+" + "-" * inner + "+"

    card = [border, f"|{name:^{inner}}|"]
    if title:
        card.append(f"|{title:^{inner}}|")
    card.append(border)
    # Detail rows: 1 border + 2 padding on each side leaves width - 6
    # columns for the value.
    if email:
        card.append(f"|  {email:<{width - 6}}  |")
    if phone:
        card.append(f"|  {phone:<{width - 6}}  |")
    card.append(border)
    return "\n".join(card)


# Test it
contact = {
    "name": " ALICE smith ",
    "email": "Alice@Example.COM",
    "phone": "123-456-7890",
    "title": "software engineer"
}
print(create_contact_card(contact))
Key Takeaways
Remember These Points
CLEANING: Chain methods for multi-step cleaning
text.strip().lower().replace()
EXTRACTING: Split strategically to get parts
email.split("@"), url.split("/")
VALIDATING: Use isalpha(), isdigit(), isalnum()
Check length, characters, patterns
TRANSFORMING: Combine techniques creatively
Slugs, title case, formatting
ANALYZING: Build frequency counts and statistics
split(), count(), dictionaries
COMBINE IT ALL for real-world solutions!
Module Complete!
Congratulations! You've mastered strings in Python!
You can now:
- Create and manipulate strings with confidence
- Use powerful string methods for any text task
- Format beautiful output with f-strings
- Build real-world text processing solutions
In the next module, you'll learn about File Handling – how to read, write, and process files!
