Practical Examples¶
Data Cleaning¶
Normalizing Whitespace¶
import re
# Sample input with leading/trailing blanks, tabs and an embedded newline.
raw = " Hello,\t\tWorld! \n Too many spaces. "

# Replace every run of whitespace with one space, then trim both ends.
WHITESPACE_RUN = re.compile(r'\s+')
cleaned = WHITESPACE_RUN.sub(' ', raw).strip()

print(cleaned)
# 'Hello, World! Too many spaces.'
Removing Non-ASCII Characters¶
import re
text = "Café résumé naïve"

# Blacklist approach: delete every run of characters outside 7-bit ASCII.
ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
# 'Caf rsum nave'

# Whitelist approach: keep letters, digits, whitespace and basic punctuation.
basic_only = re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', text)
# 'Caf rsum nave'
Cleaning Financial Data¶
import re
# Plain "$" needs no escaping inside a Python string literal.
prices = ["$1,234.56", "€ 2,100.00", "¥10,000", "$42", "N/A"]


def extract_number(s):
    """Extract the numeric value from a price string.

    Everything except digits and dots is removed (currency symbols,
    thousands separators, spaces) and the remainder is parsed as a float.

    Args:
        s: Price text such as "$1,234.56".

    Returns:
        The parsed float, or None when no digits remain or the leftover
        text is not a valid number (e.g. "1.2.3" or "...").
    """
    cleaned = re.sub(r'[^\d.]', '', s)
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        # Leftovers such as "." or "1.2.3" are not numbers; treat as missing.
        return None


for p in prices:
    print(f"{p:>12} → {extract_number(p)}")
#    $1,234.56 → 1234.56
#   € 2,100.00 → 2100.0
#      ¥10,000 → 10000.0
#          $42 → 42.0
#          N/A → None
Validation Patterns¶
Email Validation¶
import re
# Simplified email pattern (covers most common formats).
# The pattern ends with \Z rather than $ because $ also matches just before
# a trailing newline, which would let "user@example.com\n" pass validation.
EMAIL_RE = re.compile(r"""
    ^
    [\w.+-]+          # local part: word chars, dots, plus, hyphen
    @                 # at sign
    [\w-]+            # domain name
    (?:\.[\w-]+)*     # optional subdomains
    \.                # dot before TLD
    [a-zA-Z]{2,}      # TLD (at least 2 letters)
    \Z
""", re.VERBOSE)

tests = [
    "user@example.com",
    "first.last+tag@sub.domain.co.uk",
    "not-an-email",
    "@missing-local.com",
    "missing-domain@",
    "spaces not@allowed.com",
]

for t in tests:
    # fullmatch() requires the whole string to match the pattern.
    valid = "✓" if EMAIL_RE.fullmatch(t) else "✗"
    print(f" {valid} {t}")
# ✓ user@example.com
# ✓ first.last+tag@sub.domain.co.uk
# ✗ not-an-email
# ✗ @missing-local.com
# ✗ missing-domain@
# ✗ spaces not@allowed.com
Date Validation¶
import re
# Validates the *shape* of a YYYY-MM-DD / YYYY/MM/DD date.
# - The separator is captured (unnamed group 2) and reused via \2 so that
#   mixed forms such as "2024-01/15" are rejected.
# - \Z is used instead of $ so a trailing newline is not accepted.
# - Calendar validity (e.g. February 31st) is NOT checked here.
DATE_RE = re.compile(r"""
    ^
    (?P<year>\d{4})                 # year: 4 digits
    ([-/])                          # separator (group 2)
    (?P<month>0[1-9]|1[0-2])        # month: 01-12
    \2                              # the same separator again
    (?P<day>0[1-9]|[12]\d|3[01])    # day: 01-31
    \Z
""", re.VERBOSE)

tests = ["2024-01-15", "2024/12/31", "2024-13-01", "2024-00-15", "24-01-15"]

for t in tests:
    m = DATE_RE.fullmatch(t)
    # groupdict() only contains the named groups, so the separator is omitted.
    status = f"✓ {m.groupdict()}" if m else "✗"
    print(f" {status} ← {t}")
# ✓ {'year': '2024', 'month': '01', 'day': '15'} ← 2024-01-15
# ✓ {'year': '2024', 'month': '12', 'day': '31'} ← 2024/12/31
# ✗ ← 2024-13-01
# ✗ ← 2024-00-15
# ✗ ← 24-01-15
IP Address Validation¶
import re
# Match IPv4 addresses (0.0.0.0 to 255.255.255.255).
# \Z replaces $ so that "1.2.3.4\n" is not accepted as valid.
# Note: octets with leading zeros (e.g. "01") are still accepted.
IP_RE = re.compile(r"""
    ^
    (?:
        (?:25[0-5]|2[0-4]\d|[01]?\d\d?)   # octet: 0-255
        \.                                # dot
    ){3}
    (?:25[0-5]|2[0-4]\d|[01]?\d\d?)       # last octet (no trailing dot)
    \Z
""", re.VERBOSE)

tests = ["192.168.1.1", "255.255.255.255", "0.0.0.0", "256.1.1.1", "1.2.3"]

for t in tests:
    valid = "✓" if IP_RE.fullmatch(t) else "✗"
    print(f" {valid} {t}")
# ✓ 192.168.1.1
# ✓ 255.255.255.255
# ✓ 0.0.0.0
# ✗ 256.1.1.1
# ✗ 1.2.3
Log Parsing¶
Apache/Nginx Access Log¶
import re
# Parses one line of an Apache/Nginx access log into named fields.
# The identd and user columns are matched as arbitrary non-space tokens so
# that lines with an authenticated user (e.g. "- frank") also parse; they
# are not captured, which keeps groupdict() limited to the fields below.
LOG_RE = re.compile(r"""
    (?P<ip>[\d.]+)            # IP address
    \s+\S+\s+\S+\s+           # identd and user (usually - -)
    \[(?P<date>[^\]]+)\]      # date in brackets
    \s+
    "(?P<method>\w+)          # HTTP method
    \s+(?P<path>\S+)          # request path
    \s+(?P<proto>[^"]+)"      # protocol
    \s+(?P<status>\d{3})      # status code
    \s+(?P<size>\d+|-)        # response size
""", re.VERBOSE)

log_line = '192.168.1.100 - - [15/Jan/2024:10:30:45 +0000] "GET /api/data HTTP/1.1" 200 1234'

match = LOG_RE.match(log_line)
if match:
    info = match.groupdict()
    for k, v in info.items():
        print(f" {k:>8}: {v}")
#       ip: 192.168.1.100
#     date: 15/Jan/2024:10:30:45 +0000
#   method: GET
#     path: /api/data
#    proto: HTTP/1.1
#   status: 200
#     size: 1234
Python Traceback Extraction¶
import re
traceback_text = """
Traceback (most recent call last):
File "main.py", line 42, in process_data
result = compute(data)
File "utils.py", line 15, in compute
return data / 0
ZeroDivisionError: division by zero
"""

# Extract file, line number, and function name.
# The function part also allows "<" and ">" so that frames reported as
# <module>, <lambda>, etc. are not silently skipped.
FRAME_RE = re.compile(r'File "(?P<file>[^"]+)", line (?P<line>\d+), in (?P<func>[\w<>]+)')

for m in FRAME_RE.finditer(traceback_text):
    print(m.groupdict())
# {'file': 'main.py', 'line': '42', 'func': 'process_data'}
# {'file': 'utils.py', 'line': '15', 'func': 'compute'}

# Extract the exception type and message (re.M lets ^/$ anchor on each line).
# Only exception names ending in "Error" are recognised by this pattern.
exc_match = re.search(r'^(\w+Error): (.+)$', traceback_text, re.M)
if exc_match:
    print(f"Exception: {exc_match.group(1)}")  # ZeroDivisionError
    print(f"Message: {exc_match.group(2)}")  # division by zero
Text Transformation¶
Markdown to Plain Text¶
import re
md = """
# Main Title
This is **bold** and *italic* text.
Here's a [link](https://example.com) and `inline code`.
- List item 1
- List item 2
"""


def md_to_plain(text):
    """Strip a small subset of Markdown syntax and return plain text.

    Handles ATX headers, ``-``/``*`` bullets, bold, italic, inline links and
    inline code. Bullets are rewritten before emphasis is processed;
    otherwise a ``*`` bullet would be paired with the opening ``*`` of an
    italic span on the same line (``* item *x*``) and the text would be
    corrupted.

    Args:
        text: Markdown source.

    Returns:
        The text with the markup removed and surrounding whitespace stripped.
    """
    text = re.sub(r'^#+\s+', '', text, flags=re.M)        # headers
    text = re.sub(r'^[-*]\s+', '• ', text, flags=re.M)    # list items
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)          # bold (before italic)
    text = re.sub(r'\*(.+?)\*', r'\1', text)              # italic
    text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)       # links
    text = re.sub(r'`(.+?)`', r'\1', text)                # inline code
    return text.strip()


print(md_to_plain(md))
# Main Title
# This is bold and italic text.
# Here's a link and inline code.
# • List item 1
# • List item 2
CamelCase ↔ snake_case¶
import re
def camel_to_snake(name):
    """Convert camelCase or PascalCase to snake_case.

    Args:
        name: Identifier such as "myVariableName" or "HTTPResponse".

    Returns:
        The lower-cased identifier with underscores at word boundaries.
    """
    # Boundary 1: a lowercase letter or digit followed by an uppercase letter.
    s = re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name)
    # Boundary 2: the end of an acronym, i.e. an uppercase run followed by a
    # capitalised word (e.g., HTTPResponse → HTTP_Response).
    s = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', s)
    return s.lower()
def snake_to_camel(name):
    """Convert snake_case to camelCase.

    The first underscore-separated part is kept as written; every following
    part is capitalised (first letter upper, rest lower) and appended.

    Args:
        name: Identifier such as "my_variable_name".

    Returns:
        The camelCase form, e.g. "myVariableName".
    """
    parts = name.split('_')
    return parts[0] + ''.join(p.capitalize() for p in parts[1:])
# camelCase / PascalCase → snake_case
for camel_name in ("myVariableName", "HTTPResponse", "getHTTPSUrl"):
    print(camel_to_snake(camel_name))
# my_variable_name
# http_response
# get_https_url

# snake_case → camelCase
for snake_name in ("my_variable_name", "http_response"):
    print(snake_to_camel(snake_name))
# myVariableName
# httpResponse
Financial Data Patterns¶
Ticker and Price Extraction¶
import re
# The "$" signs are plain characters here; a backslash in front of them
# would become part of the text and prevent the pattern below from matching.
text = """
AAPL closed at $182.63 (+1.25%)
GOOGL rose to $141.80 (-0.34%)
MSFT ended at $378.91 (+0.89%)
"""

# ticker, two filler words, "$" + price, then the signed percentage change
# in parentheses.
STOCK_RE = re.compile(r'(?P<ticker>[A-Z]{1,5})\s+\w+\s+\w+\s+\$(?P<price>[\d.]+)\s+\((?P<change>[+-][\d.]+%)\)')

for m in STOCK_RE.finditer(text):
    d = m.groupdict()
    print(f" {d['ticker']:>5}: ${d['price']} ({d['change']})")
#   AAPL: $182.63 (+1.25%)
#  GOOGL: $141.80 (-0.34%)
#   MSFT: $378.91 (+0.89%)
CSV Line Parser¶
import re
def parse_csv_line(line):
    """Parse a CSV line handling quoted fields with commas.

    Each field must start at the beginning of the line or right after a
    comma. Anchoring on that delimiter prevents the regex from producing an
    extra empty match at every comma, which would otherwise insert spurious
    '' entries between the real fields.

    Args:
        line: A single CSV record without the trailing newline.

    Returns:
        List of field strings. Doubled quotes inside quoted fields are
        unescaped and surrounding whitespace is stripped from every field.
        Genuinely empty fields ("a,,b") are preserved as ''.
    """
    # group 1: body of a quoted field ("" is an escaped quote)
    # group 2: an unquoted field (anything up to the next comma)
    pattern = r'(?:^|,)(?:"([^"]*(?:""[^"]*)*)"|([^,]*))'
    fields = []
    for m in re.finditer(pattern, line):
        quoted, unquoted = m.groups()
        # `is not None` distinguishes an empty quoted field from "no match".
        field = quoted.replace('""', '"') if quoted is not None else unquoted
        fields.append(field.strip())
    return fields


line = 'John,"New York, NY",42,"He said ""hi"""'
print(parse_csv_line(line))
# ['John', 'New York, NY', '42', 'He said "hi"']
Common Regex Recipes¶
| Task | Pattern |
|---|---|
| Integer | r'[+-]?\d+' |
| Float | r'[+-]?\d*\.?\d+' |
| Scientific notation | r'[+-]?\d*\.?\d+(?:[eE][+-]?\d+)?' |
| Hex color | r'#[0-9a-fA-F]{3,6}\b' |
| US phone | r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}' |
| URL | r'https?://\S+' |
| Email (simple) | r'[\w.+-]+@[\w-]+\.[\w.]+' |
| IP address | r'\b(?:\d{1,3}\.){3}\d{1,3}\b' |
| ISO date | r'\d{4}-\d{2}-\d{2}' |
| Blank lines | r'^\s*$' with re.M |
| Duplicate words | r'\b(\w+)\s+\1\b' |
| HTML tags | r'<[^>]+>' |
Regex Is Not Always the Answer
For structured formats like JSON, XML, HTML, or CSV, prefer dedicated parsers (json, xml.etree, html.parser, csv modules). Regex is best for semi-structured text, validation, and simple extraction tasks.
Summary¶
| Category | Key Takeaway |
|---|---|
| Data cleaning | re.sub() for normalization; extract numbers from messy strings |
| Validation | re.fullmatch() with verbose patterns for readable validators |
| Log parsing | Named groups + finditer() for structured extraction |
| Transformation | Backreferences in re.sub() for reordering and reformatting |
| Financial data | Combine named groups with specific numeric patterns |
| When NOT to use | Prefer dedicated parsers for JSON, XML, HTML, CSV |
Runnable Example: practical_applications_tutorial.py¶
"""
Python Regular Expressions - Tutorial 07: Practical Applications
================================================================
This tutorial demonstrates real-world applications of regex patterns.
DIFFICULTY: ADVANCED
"""
import re
# =============================================================================
# Main
# =============================================================================
def run_practical_examples():
    """Print the results of several real-world regex examples.

    Covers email/URL/IP validation, phone-number reformatting, link
    extraction, log-line parsing and text cleaning. All output goes to
    stdout; nothing is returned.
    """
    print("="*70)
    print("REAL-WORLD REGEX PATTERNS")
    print("="*70)

    # Email validation (fullmatch: the whole string must match)
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    emails = ["user@example.com", "invalid@", "test@test.co.uk"]
    print("\nEmail Validation:")
    for email in emails:
        valid = bool(re.fullmatch(email_pattern, email))
        print(f" {'✓' if valid else '✗'} {email}")

    # URL validation
    url_pattern = r"^https?://(?:www\.)?[\w.-]+\.\w+(?:/[\w./?%&=-]*)?$"
    urls = ["https://example.com", "http://www.test.org/page", "invalid"]
    print("\nURL Validation:")
    for url in urls:
        valid = bool(re.fullmatch(url_pattern, url))
        print(f" {'✓' if valid else '✗'} {url}")

    # Phone number formatting: the three captured groups are re-emitted as
    # "(AAA) BBB-CCCC" through backreferences in the replacement.
    phone_text = "Call 5551234567 or 8005551234"
    phone_pattern = r"(\d{3})(\d{3})(\d{4})"
    formatted = re.sub(phone_pattern, r"(\1) \2-\3", phone_text)
    print(f"\nPhone Formatting:\n Before: {phone_text}\n After: {formatted}")

    # Extract all links from HTML
    html = '<a href="http://example.com">Link</a> <a href="http://test.org">Test</a>'
    link_pattern = r'href="(https?://[^"]+)"'
    links = re.findall(link_pattern, html)
    print(f"\nExtracted Links: {links}")

    # Data extraction from logs
    log = "2024-03-15 14:30:45 ERROR Connection failed"
    log_pattern = r"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) (.+)"
    match = re.search(log_pattern, log)
    if match:
        print("\nLog Parsing:")
        print(f" Date: {match.group(1)}")
        print(f" Time: {match.group(2)}")
        print(f" Level: {match.group(3)}")
        print(f" Message: {match.group(4)}")

    # Text cleaning: collapse whitespace, then reduce a run of the same
    # "!" or "?" to a single character (\1 keeps one copy of it).
    text = "Hello!!! How are you??? "
    cleaned = re.sub(r'\s+', ' ', text).strip()
    cleaned = re.sub(r'([!?])\1+', r'\1', cleaned)
    print(f"\nText Cleaning:\n Before: '{text}'\n After: '{cleaned}'")

    # IP address validation
    ip_pattern = r"^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$"
    ips = ["192.168.1.1", "256.1.1.1", "10.0.0.1"]
    print("\nIP Address Validation:")
    for ip in ips:
        valid = bool(re.fullmatch(ip_pattern, ip))
        print(f" {'✓' if valid else '✗'} {ip}")

    print("\n" + "="*70)
    print("END OF TUTORIAL - All concepts completed!")
    print("="*70)


if __name__ == "__main__":
    run_practical_examples()
Runnable Example: regex_web_scraping_tutorial.py¶
"""
Topic 7.10 - Regex for Web Scraping and Data Extraction Tutorial
Real-world regex patterns for extracting structured data from
semi-structured web content (HTML, JSON-in-HTML, API responses).
When APIs aren't available, web scraping often requires parsing raw HTML
or embedded JSON to extract the data you need. Regex is a pragmatic tool for
this when the content is too irregular for a dedicated parser: find patterns
in messy text and pull out clean, structured fields.
This tutorial builds a complete pipeline:
raw text → regex extraction → deduplication → pandas DataFrame → CSV
Inspired by a Starbucks store location scraper that parses store data
(ID, name, latitude, longitude, city, state, zip) from web responses.
Learning Objectives:
- Writing multi-group regex patterns for structured data extraction
- Extracting repeated records from large text blocks
- Non-greedy matching (.*?) to avoid over-matching
- Named groups for readability
- Deduplication patterns for scraped data
- Building DataFrames from extracted data
- Combining regex with pandas for data pipelines
Prerequisites:
- ch07/regex (regex basics, groups, quantifiers)
- ch10/core (pandas DataFrame basics)
- ch15/networking (HTTP requests, web scraping)
Author: Python Educator
Date: 2024
"""
import re
import csv
import json
from typing import Any
# ============================================================================
# PART 1: BEGINNER - Extracting Fields from Structured Text
# ============================================================================
def demonstrate_basic_field_extraction():
    """
    Extract individual fields from semi-structured text using regex groups.

    Three printed demos:
    1. Pulling single values out of JSON embedded in a <script> tag.
    2. Greedy (.*) versus non-greedy (.*?) matching on quoted values.
    3. Named groups for parsing a log line.

    Prints to stdout and returns nothing.
    """
    print("=" * 70)
    print("BEGINNER: Extracting Fields from Structured Text")
    print("=" * 70)

    # --- Extracting from JSON-like text ---
    print("\n1. Extracting from JSON-like text in HTML")
    print("-" * 50)

    # Simulate text embedded in a web page (common in store locators)
    html_fragment = '''
<script>var storeData = {"storeNumber":"12345","name":"Downtown Plaza",
"coordinates":{"latitude":34.0522,"longitude":-118.2437},
"address":{"city":"Los Angeles","state":"CA","postalCode":"90012"}};
</script>
'''

    # Extract store number
    store_num = re.search(r'"storeNumber":"(\d+)"', html_fragment)
    if store_num:
        print(f" Store number: {store_num.group(1)}")

    # Extract store name
    store_name = re.search(r'"name":"([^"]+)"', html_fragment)
    if store_name:
        print(f" Store name : {store_name.group(1)}")

    # Extract latitude and longitude (unquoted numbers, possibly negative)
    lat = re.search(r'"latitude":([-\d.]+)', html_fragment)
    lon = re.search(r'"longitude":([-\d.]+)', html_fragment)
    if lat and lon:
        print(f" Latitude : {lat.group(1)}")
        print(f" Longitude : {lon.group(1)}")

    # Extract city
    city = re.search(r'"city":"([^"]+)"', html_fragment)
    if city:
        print(f" City : {city.group(1)}")

    # --- Non-greedy vs greedy matching ---
    print("\n2. Non-greedy matching: .*? vs .*")
    print("-" * 50)
    text = '"name":"First Store","other":"data","name":"Second Store"'

    # Greedy: .* grabs as much as possible
    greedy = re.search(r'"name":"(.*)"', text)
    print(f" Greedy .* : {greedy.group(1) if greedy else 'None'}")
    # Gets: First Store","other":"data","name":"Second Store

    # Non-greedy: .*? grabs as little as possible
    nongreedy = re.search(r'"name":"(.*?)"', text)
    print(f" Non-greedy .*? : {nongreedy.group(1) if nongreedy else 'None'}")
    # Gets: First Store

    print("\n Key insight: Use .*? when extracting from quoted values")
    print(" to stop at the FIRST closing quote, not the last one.")

    # --- Named groups for readability ---
    print("\n3. Named groups (?P<name>...) for readability")
    print("-" * 50)
    log_line = '2024-03-15 14:30:45 [ERROR] Server: Connection refused (port 5432)'
    pattern = (
        r'(?P<date>\d{4}-\d{2}-\d{2}) '
        r'(?P<time>\d{2}:\d{2}:\d{2}) '
        r'\[(?P<level>\w+)\] '
        r'(?P<source>\w+): '
        r'(?P<message>.+)'
    )
    match = re.search(pattern, log_line)
    if match:
        print(f" date = {match.group('date')}")
        print(f" time = {match.group('time')}")
        print(f" level = {match.group('level')}")
        print(f" source = {match.group('source')}")
        print(f" message = {match.group('message')}")
        print(f"\n As dict: {match.groupdict()}")

    print("\n" + "=" * 70 + "\n")
# ============================================================================
# PART 2: INTERMEDIATE - Multi-Record Extraction from Web Content
# ============================================================================
def demonstrate_multi_record_extraction():
    """
    Extract multiple records from a block of text using regex.

    Strategy shown:
    1. Use re.findall() to cut the response into one chunk per record.
    2. Apply a multi-group pattern to each chunk to pull out its fields.
    3. Alternatively, run the multi-group pattern over the whole text at
       once; re.findall() then returns one tuple per record.

    Returns:
        List of 7-tuples of strings
        (store_id, name, lat, lon, city, state, zip) from step 3. The sample
        data deliberately contains a duplicate store.
    """
    print("=" * 70)
    print("INTERMEDIATE: Extracting Multiple Records")
    print("=" * 70)

    # Simulate a web response with multiple store records
    # This mimics what a store locator API might return embedded in HTML
    response_text = '''
{"stores":[
{"storeNumber":"10234","name":"Main St & 5th","coordinates":
{"latitude":34.0522,"longitude":-118.2437},"address":
{"city":"Los Angeles","countrySubdivisionCode":"CA","postalCode":"90012"},"slug":"main-st"},
{"storeNumber":"10567","name":"Hollywood & Vine","coordinates":
{"latitude":34.1017,"longitude":-118.3267},"address":
{"city":"Hollywood","countrySubdivisionCode":"CA","postalCode":"90028"},"slug":"hollywood"},
{"storeNumber":"10891","name":"Venice Boardwalk","coordinates":
{"latitude":33.9850,"longitude":-118.4695},"address":
{"city":"Venice","countrySubdivisionCode":"CA","postalCode":"90291"},"slug":"venice"},
{"storeNumber":"10234","name":"Main St & 5th","coordinates":
{"latitude":34.0522,"longitude":-118.2437},"address":
{"city":"Los Angeles","countrySubdivisionCode":"CA","postalCode":"90012"},"slug":"duplicate"}
]}
'''

    # --- Step 1: Split into individual records ---
    print("\n1. Finding individual records with re.findall()")
    print("-" * 50)

    # Each record runs from "storeNumber" to "slug"; re.DOTALL lets "." cross
    # the line breaks inside a record.
    record_pattern = r'"storeNumber":.*?"slug"'
    records = re.findall(record_pattern, response_text, re.DOTALL)
    print(f" Found {len(records)} records")

    # --- Step 2: Extract fields from each record ---
    print("\n2. Extracting fields with multi-group pattern")
    print("-" * 50)

    # This pattern uses 7 capturing groups to extract all fields at once
    field_pattern = (
        r'"storeNumber":"(.*?)"'                 # Group 1: store ID
        r'.*?"name":"(.*?)"'                     # Group 2: store name
        r'.*?"latitude":([\d.-]+)'               # Group 3: latitude
        r'.*?"longitude":([\d.-]+)'              # Group 4: longitude
        r'.*?"city":"(.*?)"'                     # Group 5: city
        r'.*?"countrySubdivisionCode":"(.*?)"'   # Group 6: state
        r'.*?"postalCode":"(.*?)"'               # Group 7: zip code
    )
    for record in records:
        match = re.search(field_pattern, record, re.DOTALL)
        if match:
            store_id, name, lat, lon, city, state, zip_code = match.groups()
            print(f" Store {store_id}: {name}")
            print(f" Location: ({lat}, {lon})")
            print(f" Address: {city}, {state} {zip_code}")
            print()

    # --- Alternative: re.findall with groups returns list of tuples ---
    print("3. Shortcut: re.findall() with groups")
    print("-" * 50)

    # When the pattern has groups, findall returns list of tuples
    all_matches = re.findall(field_pattern, response_text, re.DOTALL)
    print(f" re.findall returned {len(all_matches)} tuples")
    for m in all_matches:
        print(f" {m[0]}: {m[1]} in {m[4]}, {m[5]}")

    print("\n" + "=" * 70 + "\n")
    return all_matches
# ============================================================================
# PART 3: INTERMEDIATE - Deduplication Patterns
# ============================================================================
def demonstrate_deduplication(records: list[tuple]) -> list[tuple]:
    """
    Deduplicate scraped records by their first element (the store ID).

    Web scraping often produces duplicates because:
    - The same item appears on multiple pages
    - Search regions overlap (nearby zip codes)
    - Pagination issues

    Args:
        records: List of tuples (store_id, name, lat, lon, city, state, zip)

    Returns:
        Deduplicated list; the first occurrence of each ID is kept and the
        original order is preserved.
    """
    print("=" * 70)
    print("INTERMEDIATE: Deduplication Patterns for Scraped Data")
    print("=" * 70)
    print(f"\n Records before dedup: {len(records)}")

    # --- Method 1: Track seen IDs ---
    print("\n1. Method 1: Track seen IDs with a set")
    print("-" * 50)
    seen_ids = set()
    unique_records = []
    duplicates = 0
    for record in records:
        store_id = record[0]
        if store_id not in seen_ids:
            seen_ids.add(store_id)
            unique_records.append(record)
        else:
            duplicates += 1
            print(f" Duplicate found: store {store_id} ({record[1]})")
    print(f" After dedup: {len(unique_records)} unique, {duplicates} removed")

    # --- Method 2: dict keyed by ID (dicts preserve insertion order) ---
    print("\n2. Method 2: dict keyed by ID (preserves insertion order)")
    print("-" * 50)
    store_dict = {}
    for record in records:
        # setdefault() stores the record only if the ID is not present yet,
        # so the first occurrence wins.
        store_dict.setdefault(record[0], record)
    unique_via_dict = list(store_dict.values())
    print(f" Result: {len(unique_via_dict)} unique records")

    # --- Method 3: Using pandas (for large datasets) ---
    print("\n3. Method 3: pandas drop_duplicates (best for large data)")
    print("-" * 50)
    print(" import pandas as pd")
    print(" df = pd.DataFrame(records, columns=[...])")
    print(" df = df.drop_duplicates(subset=['store_id'])")
    print(" # Handles thousands of records efficiently")

    # --- Why sets work well here ---
    print("\n4. Why set-based dedup is O(n)")
    print("-" * 50)
    print(" set.add() and 'in set' are both O(1) average case")
    print(" Looping through n records: n × O(1) = O(n)")
    print(" vs. list-based 'if id in seen_list': O(n) per check → O(n²)")
    print(" For 10,000 records: set is ~10,000x faster than list!")

    print("\n" + "=" * 70 + "\n")
    return unique_records
# ============================================================================
# PART 4: INTERMEDIATE - Building a DataFrame from Scraped Data
# ============================================================================
def demonstrate_dataframe_construction(records: list[tuple]):
    """
    Convert extracted records into a structured pandas DataFrame.

    This is the typical final step in a scraping pipeline:
    raw text → regex → tuples → DataFrame → CSV/analysis

    Args:
        records: Scraped record tuples. NOTE: this demo does not read the
            argument; it always uses the built-in 10-row sample below.

    Side effects:
        Prints a summary and writes /tmp/scraped_stores.csv. When pandas is
        not installed the CSV is written with the csv module instead.
    """
    print("=" * 70)
    print("INTERMEDIATE: Building a DataFrame from Scraped Data")
    print("=" * 70)
    columns = ["store_id", "name", "latitude", "longitude",
               "city", "state", "zip_code"]
    csv_path = "/tmp/scraped_stores.csv"

    # --- Method 1: From list of tuples ---
    print("\n1. DataFrame from list of tuples")
    print("-" * 50)

    # We'll use a simulated dataset here (real scraping would use actual data)
    sample_records = [
        ("10234", "Main St & 5th", "34.0522", "-118.2437", "Los Angeles", "CA", "90012"),
        ("10567", "Hollywood & Vine", "34.1017", "-118.3267", "Hollywood", "CA", "90028"),
        ("10891", "Venice Boardwalk", "33.9850", "-118.4695", "Venice", "CA", "90291"),
        ("11023", "Pasadena Old Town", "34.1478", "-118.1445", "Pasadena", "CA", "91101"),
        ("11156", "Santa Monica Pier", "34.0094", "-118.4973", "Santa Monica", "CA", "90401"),
        ("11289", "Beverly Hills Plaza", "34.0736", "-118.4004", "Beverly Hills", "CA", "90210"),
        ("11422", "Downtown Arts Dist", "34.0402", "-118.2351", "Los Angeles", "CA", "90013"),
        ("11555", "Glendale Galleria", "34.1456", "-118.2551", "Glendale", "CA", "91210"),
        ("11688", "Burbank Media Dist", "34.1808", "-118.3090", "Burbank", "CA", "91502"),
        ("11821", "Long Beach Pike", "33.7675", "-118.1924", "Long Beach", "CA", "90802"),
    ]

    # pandas is optional (this is a regex tutorial). Only the import itself is
    # guarded so that unrelated errors in the code below are not swallowed.
    try:
        import pandas as pd
    except ImportError:
        pd = None

    if pd is None:
        print(" (pandas not available — showing CSV approach instead)")
        # Fallback: pure csv module
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(columns)
            writer.writerows(sample_records)
        print(f" Wrote {len(sample_records)} records to {csv_path}")
    else:
        # Direct construction — most common approach
        df = pd.DataFrame(sample_records, columns=columns)

        # Convert numeric columns from strings
        df["latitude"] = df["latitude"].astype(float)
        df["longitude"] = df["longitude"].astype(float)
        print(f" Shape: {df.shape}")
        print(f" Dtypes:\n{df.dtypes}")
        print("\n Preview:")
        print(df[["store_id", "name", "city", "zip_code"]].to_string(index=False))

        # --- Quick analysis ---
        print("\n2. Quick analysis of scraped data")
        print("-" * 50)
        print(f" Total stores : {len(df)}")
        print(f" Unique cities : {df['city'].nunique()}")
        print(" Stores per city :")
        city_counts = df["city"].value_counts()
        for city, count in city_counts.items():
            print(f" {city}: {count}")
        print("\n Geographic bounds:")
        print(f" Lat range : {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
        print(f" Lon range : {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")

        # --- Save to CSV ---
        print("\n3. Saving to CSV")
        print("-" * 50)
        df.to_csv(csv_path, index=False)
        print(f" Saved {len(df)} records to {csv_path}")

        # Verify by reading back
        df_check = pd.read_csv(csv_path)
        print(f" Verified: read back {len(df_check)} records")

    print("\n" + "=" * 70 + "\n")
# ============================================================================
# PART 5: ADVANCED - Complete Regex Scraping Pipeline
# ============================================================================
def demonstrate_complete_pipeline():
    """
    A complete pipeline that combines all the patterns:
    1. Simulate fetching a web response
    2. Extract records with regex
    3. Clean and normalize fields
    4. Deduplicate
    5. Build DataFrame
    6. Export to CSV

    Prints each step to stdout and writes
    /tmp/scraped_stores_pipeline.csv. Returns nothing.
    """
    print("=" * 70)
    print("ADVANCED: Complete Regex Scraping Pipeline")
    print("=" * 70)

    # --- Step 1: Simulated web response ---
    # In practice, this comes from requests.get(url).text
    print("\n--- Step 1: Receive web response ---")
    response_text = '''
window.__INITIAL_STATE__ = {"stores":{"items":[
{"storeNumber":"10234","name":"Main St \\u0026 5th Ave","coordinates":
{"latitude":34.0522,"longitude":-118.2437},"address":
{"city":"Los Angeles","countrySubdivisionCode":"CA",
"postalCode":"90012-1234"},"slug":"main-st"},
{"storeNumber":"10567","name":"Hollywood \\u0026 Vine","coordinates":
{"latitude":34.1017,"longitude":-118.3267},"address":
{"city":"Hollywood ","countrySubdivisionCode":"CA",
"postalCode":"90028"},"slug":"hw-vine"},
{"storeNumber":"10891","name":"Venice Boardwalk","coordinates":
{"latitude":33.9850,"longitude":-118.4695},"address":
{"city":"Venice","countrySubdivisionCode":"CA",
"postalCode":"90291"},"slug":"venice-bw"},
{"storeNumber":"10234","name":"Main St \\u0026 5th Ave","coordinates":
{"latitude":34.0522,"longitude":-118.2437},"address":
{"city":"Los Angeles","countrySubdivisionCode":"CA",
"postalCode":"90012-1234"},"slug":"main-st-dup"}
]}};
'''
    print(f" Response length: {len(response_text)} chars")

    # --- Step 2: Extract records ---
    print("\n--- Step 2: Extract records with regex ---")
    record_pattern = r'"storeNumber":.*?"slug"'
    raw_records = re.findall(record_pattern, response_text, re.DOTALL)
    print(f" Found {len(raw_records)} raw records")
    field_pattern = (
        r'"storeNumber":"(?P<id>.*?)"'
        r'.*?"name":"(?P<name>.*?)"'
        r'.*?"latitude":(?P<lat>[\d.-]+)'
        r'.*?"longitude":(?P<lon>[\d.-]+)'
        r'.*?"city":"(?P<city>.*?)"'
        r'.*?"countrySubdivisionCode":"(?P<state>.*?)"'
        r'.*?"postalCode":"(?P<zip>.*?)"'
    )
    extracted = []
    for record in raw_records:
        match = re.search(field_pattern, record, re.DOTALL)
        if match:
            extracted.append(match.groupdict())
    print(f" Extracted {len(extracted)} records")
    for rec in extracted:
        print(f" {rec['id']}: {rec['name']} — {rec['city']}, {rec['state']}")

    # --- Step 3: Clean and normalize ---
    print("\n--- Step 3: Clean and normalize fields ---")

    def decode_json_escapes(value: str) -> str:
        """Decode JSON string escapes (e.g. \\u0026 → &) in *value*.

        The captured text is the body of a JSON string, so it is decoded with
        the JSON parser. Round-tripping through the "unicode_escape" codec
        would corrupt any non-ASCII character already present in the text.
        Text that is not a valid JSON string body is returned unchanged.
        """
        try:
            return json.loads(f'"{value}"')
        except ValueError:
            return value

    def clean_record(record: dict) -> dict:
        """
        Clean a single scraped record.

        Common cleaning tasks:
        - Decode unicode escapes (\\u0026 → &)
        - Strip whitespace
        - Normalize zip codes (take first 5 characters)
        - Collapse multiple spaces
        """
        cleaned = {}
        cleaned["store_id"] = record["id"].strip()
        cleaned["name"] = decode_json_escapes(record["name"]).strip()
        # Collapse multiple spaces
        cleaned["name"] = re.sub(r"\s+", " ", cleaned["name"])
        cleaned["latitude"] = float(record["lat"])
        cleaned["longitude"] = float(record["lon"])
        cleaned["city"] = record["city"].strip()
        cleaned["state"] = record["state"].strip()
        # Normalize zip to 5 digits (drops a ZIP+4 suffix such as "-1234")
        cleaned["zip_code"] = record["zip"][:5]
        return cleaned

    cleaned_records = [clean_record(r) for r in extracted]
    print(f" Cleaned {len(cleaned_records)} records")
    for rec in cleaned_records:
        print(f" {rec['store_id']}: {rec['name']} — {rec['city']}, "
              f"{rec['state']} {rec['zip_code']}")

    # --- Step 4: Deduplicate ---
    print("\n--- Step 4: Deduplicate ---")
    seen = set()
    unique = []
    for rec in cleaned_records:
        if rec["store_id"] not in seen:
            seen.add(rec["store_id"])
            unique.append(rec)
        else:
            print(f" Removed duplicate: {rec['store_id']} ({rec['name']})")
    print(f" {len(cleaned_records)} → {len(unique)} after dedup")

    # --- Step 5: Build DataFrame ---
    print("\n--- Step 5: Build DataFrame ---")
    try:
        import pandas as pd
    except ImportError:
        # pandas is optional; fall back to printing the raw dicts.
        for rec in unique:
            print(f" {rec}")
    else:
        df = pd.DataFrame(unique)
        print(df.to_string(index=False))

    # --- Step 6: Save to CSV ---
    print("\n--- Step 6: Save to CSV ---")
    csv_path = "/tmp/scraped_stores_pipeline.csv"
    fieldnames = ["store_id", "name", "latitude", "longitude",
                  "city", "state", "zip_code"]
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(unique)
    print(f" Saved {len(unique)} records to {csv_path}")

    print("\n" + "=" * 70 + "\n")
# ============================================================================
# PART 6: ADVANCED - Regex Patterns for Common Web Formats
# ============================================================================
def demonstrate_common_web_patterns():
    """
    A reference of regex patterns commonly needed in web scraping.

    Demonstrates, with printed output: JSON embedded in a <script> tag,
    HTML table rows, coordinates in several textual formats, <meta> tags,
    and pagination links. Returns nothing.
    """
    print("=" * 70)
    print("ADVANCED: Common Regex Patterns for Web Scraping")
    print("=" * 70)

    # --- JSON embedded in HTML (window.__DATA__) ---
    print("\n1. Extracting embedded JSON from HTML <script> tags")
    print("-" * 50)
    html = '''
<html><head>
<script>window.__INITIAL_DATA__ = {"user":"alice","count":42};</script>
</head></html>
'''
    # Capture the object literal up to the first "};" and hand it to the JSON
    # parser rather than picking fields out with more regexes.
    pattern = r'window\.__INITIAL_DATA__\s*=\s*({.*?});'
    match = re.search(pattern, html)
    if match:
        data = json.loads(match.group(1))
        print(f" Extracted: {data}")
        print(f" user={data['user']}, count={data['count']}")

    # --- HTML table rows ---
    print("\n2. Extracting data from HTML tables")
    print("-" * 50)
    table_html = '''
<table>
<tr><td>Apple</td><td>$1.20</td><td>In Stock</td></tr>
<tr><td>Banana</td><td>$0.50</td><td>Low Stock</td></tr>
<tr><td>Cherry</td><td>$3.00</td><td>Out of Stock</td></tr>
</table>
'''
    row_pattern = r'<tr><td>(.*?)</td><td>\$([\d.]+)</td><td>(.*?)</td></tr>'
    rows = re.findall(row_pattern, table_html)
    print(f" Found {len(rows)} rows:")
    for name, price, status in rows:
        print(f" {name}: ${price} ({status})")

    # --- Coordinates from various formats ---
    print("\n3. Extracting coordinates from various formats")
    print("-" * 50)
    texts = [
        'Located at lat: 34.0522, lng: -118.2437',
        'GPS: (40.7128, -74.0060)',
        '"latitude":51.5074,"longitude":-0.1278',
        'position="48.8566,2.3522"',
    ]
    # One (pattern, label) pair per sample text above, in the same order.
    coord_patterns = [
        (r'lat:\s*([\d.-]+),\s*lng:\s*([\d.-]+)', "lat/lng format"),
        (r'\(([\d.-]+),\s*([\d.-]+)\)', "parenthesized format"),
        (r'"latitude":([\d.-]+).*?"longitude":([\d.-]+)', "JSON format"),
        (r'position="([\d.-]+),([\d.-]+)"', "attribute format"),
    ]
    for text, (pattern, fmt) in zip(texts, coord_patterns):
        match = re.search(pattern, text)
        if match:
            print(f" {fmt}: ({match.group(1)}, {match.group(2)})")

    # --- Meta tags and Open Graph ---
    print("\n4. Extracting meta tags from HTML")
    print("-" * 50)
    html_head = '''
<meta name="description" content="Best coffee shop in LA">
<meta property="og:title" content="Starbucks - Main St">
<meta property="og:image" content="https://example.com/store.jpg">
'''
    meta_pattern = r'<meta\s+(?:name|property)="([^"]+)"\s+content="([^"]+)"'
    metas = re.findall(meta_pattern, html_head)
    for name, content in metas:
        print(f" {name}: {content}")

    # --- Pagination links ---
    print("\n5. Extracting pagination info")
    print("-" * 50)
    pagination_html = '''
<a href="/stores?page=1">1</a>
<a href="/stores?page=2">2</a>
<a href="/stores?page=3" class="current">3</a>
<a href="/stores?page=4">4</a>
<a href="/stores?page=5">Next</a>
'''
    page_pattern = r'href="/stores\?page=(\d+)"'
    pages = re.findall(page_pattern, pagination_html)
    print(f" Available pages: {pages}")
    # Compare numerically: the captured page numbers are strings.
    print(f" Last page: {max(int(p) for p in pages)}")

    print("\n" + "=" * 70 + "\n")
# ============================================================================
# MAIN EXECUTION
# ============================================================================
def main():
    """Run all demonstrations in order, then print the key takeaways.

    The records returned by the multi-record demo are passed to the
    deduplication demo, whose result is passed to the DataFrame demo.
    """
    print("\n" + "=" * 70)
    print(" " * 8 + "REGEX FOR WEB SCRAPING AND DATA EXTRACTION")
    print(" " * 20 + "Complete Tutorial")
    print("=" * 70 + "\n")

    # Beginner
    demonstrate_basic_field_extraction()

    # Intermediate
    records = demonstrate_multi_record_extraction()
    unique_records = demonstrate_deduplication(records)
    demonstrate_dataframe_construction(unique_records)

    # Advanced
    demonstrate_complete_pipeline()
    demonstrate_common_web_patterns()

    print("\n" + "=" * 70)
    print("Tutorial Complete!")
    print("=" * 70)
    print("\nKey Takeaways:")
    print("1. Use .*? (non-greedy) to extract quoted values")
    print("2. Multi-group patterns extract several fields at once")
    print("3. re.findall() with groups returns list of tuples")
    print("4. Named groups (?P<name>...) improve readability")
    print("5. Always deduplicate scraped data (use sets for O(n))")
    print("6. Clean data before analysis: strip, normalize, convert types")
    print("7. Pipeline: fetch → regex → clean → dedup → DataFrame → CSV")
    print("8. re.DOTALL flag lets . match newlines in multi-line content")
    print("=" * 70 + "\n")


if __name__ == "__main__":
    main()