Practical Examples¶

Data Cleaning¶

Normalizing Whitespace¶

import re

raw = "  Hello,\t\tWorld!  \n  Too   many   spaces.  "

# Every run of whitespace (spaces, tabs, newlines) becomes one space;
# the single space left over at each end is then trimmed away.
WHITESPACE_RUN = re.compile(r'\s+')
cleaned = WHITESPACE_RUN.sub(' ', raw).strip()
print(cleaned)
# Hello, World! Too many spaces.

Removing Non-ASCII Characters¶

import re

text = "Café résumé naïve"

# Blacklist approach: drop every run of characters outside 7-bit ASCII.
non_ascii_run = re.compile(r'[^\x00-\x7F]+')
non_ascii_run.sub('', text)
# 'Caf rsum nave'

# Whitelist approach: only ASCII letters, digits, whitespace and the
# punctuation marks . , ! ? - survive.
disallowed_char = re.compile(r'[^a-zA-Z0-9\s.,!?-]')
disallowed_char.sub('', text)
# 'Caf rsum nave'

Cleaning Financial Data¶

import re

prices = ["$1,234.56", "€ 2,100.00", "¥10,000", "$42", "N/A"]

def extract_number(s):
    """Extract the numeric value from a price string.

    Everything except digits and the decimal point is discarded (currency
    symbols, thousands separators, spaces, and also any minus sign), and
    what remains is parsed as a float.

    Args:
        s: Price text such as "$1,234.56" or "€ 2,100.00".

    Returns:
        The parsed float, or None when nothing parseable is left
        (for example "N/A", "n.a." or "1.2.3").
    """
    cleaned = re.sub(r'[^\d.]', '', s)
    try:
        return float(cleaned)
    except ValueError:
        # Covers the empty string as well as leftovers like "." or "1.2.3".
        return None

for p in prices:
    print(f"{p:>12} → {extract_number(p)}")
#    $1,234.56 → 1234.56
#   € 2,100.00 → 2100.0
#      ¥10,000 → 10000.0
#          $42 → 42.0
#          N/A → None

Validation Patterns¶

Email Validation¶

import re

# Simplified email pattern (covers most common formats).
# The end anchor is \Z rather than $: in Python, $ also matches just before
# a trailing newline, which would let "user@example.com\n" pass validation.
EMAIL_RE = re.compile(r"""
    ^
    [\w.+-]+          # local part: word chars, dots, plus, hyphen
    @                 # at sign
    [\w-]+            # domain name
    (?:\.[\w-]+)*     # optional subdomains
    \.                # dot before TLD
    [a-zA-Z]{2,}      # TLD (at least 2 letters)
    \Z                # true end of string (no trailing newline allowed)
""", re.VERBOSE)

tests = [
    "user@example.com",
    "first.last+tag@sub.domain.co.uk",
    "not-an-email",
    "@missing-local.com",
    "missing-domain@",
    "spaces not@allowed.com",
]

for t in tests:
    valid = "✓" if EMAIL_RE.match(t) else "✗"
    print(f"  {valid} {t}")
#   ✓ user@example.com
#   ✓ first.last+tag@sub.domain.co.uk
#   ✗ not-an-email
#   ✗ @missing-local.com
#   ✗ missing-domain@
#   ✗ spaces not@allowed.com

Date Validation¶

import re

# Validates the *shape* of a YYYY-MM-DD or YYYY/MM/DD date. The two
# separators are matched independently, and the day range is not checked
# against the month (so 2024-02-31 passes).
# The end anchor is \Z rather than $ so that a trailing newline is rejected.
DATE_RE = re.compile(r"""
    ^
    (?P<year>\d{4})     # year: 4 digits
    [-/]                # separator
    (?P<month>0[1-9]|1[0-2])   # month: 01-12
    [-/]                # separator
    (?P<day>0[1-9]|[12]\d|3[01])  # day: 01-31
    \Z                  # true end of string (no trailing newline allowed)
""", re.VERBOSE)

tests = ["2024-01-15", "2024/12/31", "2024-13-01", "2024-00-15", "24-01-15"]
for t in tests:
    m = DATE_RE.match(t)
    status = f"✓ {m.groupdict()}" if m else "✗"
    print(f"  {status} ← {t}")
#   ✓ {'year': '2024', 'month': '01', 'day': '15'} ← 2024-01-15
#   ✓ {'year': '2024', 'month': '12', 'day': '31'} ← 2024/12/31
#   ✗ ← 2024-13-01
#   ✗ ← 2024-00-15
#   ✗ ← 24-01-15

IP Address Validation¶

import re

# Match IPv4 addresses (0.0.0.0 to 255.255.255.255).
# Each octet alternative also accepts leading zeros ("01", "007").
# The end anchor is \Z rather than $ so that a trailing newline is rejected.
IP_RE = re.compile(r"""
    ^
    (?:
        (?:25[0-5]|2[0-4]\d|[01]?\d\d?)   # octet: 0-255
        \.                                   # dot
    ){3}
    (?:25[0-5]|2[0-4]\d|[01]?\d\d?)       # last octet (no trailing dot)
    \Z                                     # true end of string
""", re.VERBOSE)

tests = ["192.168.1.1", "255.255.255.255", "0.0.0.0", "256.1.1.1", "1.2.3"]
for t in tests:
    valid = "✓" if IP_RE.match(t) else "✗"
    print(f"  {valid} {t}")
#   ✓ 192.168.1.1
#   ✓ 255.255.255.255
#   ✓ 0.0.0.0
#   ✗ 256.1.1.1
#   ✗ 1.2.3

Log Parsing¶

Apache/Nginx Access Log¶

import re

# Common Log Format: one named group per field of interest.
LOG_RE = re.compile(r"""
    (?P<ip>[\d.]+)              # IP address
    \s+-\s+-\s+                 # identd and user (usually - -)
    \[(?P<date>[^\]]+)\]        # date in brackets
    \s+
    "(?P<method>\w+)            # HTTP method
    \s+(?P<path>\S+)            # request path
    \s+(?P<proto>[^"]+)"        # protocol
    \s+(?P<status>\d{3})        # status code
    \s+(?P<size>\d+|-)          # response size
""", re.VERBOSE)

log_line = '192.168.1.100 - - [15/Jan/2024:10:30:45 +0000] "GET /api/data HTTP/1.1" 200 1234'

match = LOG_RE.match(log_line)
if match is not None:
    # groupdict() maps each group name to its captured text, in pattern order.
    info = match.groupdict()
    print("\n".join(f"  {field:>8}: {value}" for field, value in info.items()))
#        ip: 192.168.1.100
#      date: 15/Jan/2024:10:30:45 +0000
#    method: GET
#      path: /api/data
#     proto: HTTP/1.1
#    status: 200
#      size: 1234

Python Traceback Extraction¶

import re

traceback_text = """
Traceback (most recent call last):
  File "main.py", line 42, in process_data
    result = compute(data)
  File "utils.py", line 15, in compute
    return data / 0
ZeroDivisionError: division by zero
"""

# One match per stack frame: source file, line number and enclosing function.
FRAME_RE = re.compile(r'File "(?P<file>[^"]+)", line (?P<line>\d+), in (?P<func>\w+)')

frames = [frame_match.groupdict() for frame_match in FRAME_RE.finditer(traceback_text)]
for frame in frames:
    print(frame)
# {'file': 'main.py', 'line': '42', 'func': 'process_data'}
# {'file': 'utils.py', 'line': '15', 'func': 'compute'}

# The final "SomethingError: message" line; MULTILINE lets ^ and $ anchor
# to individual lines of the traceback.
exc_match = re.search(r'^(\w+Error): (.+)$', traceback_text, flags=re.MULTILINE)
if exc_match is not None:
    exc_type, exc_message = exc_match.groups()
    print(f"Exception: {exc_type}")  # ZeroDivisionError
    print(f"Message: {exc_message}")     # division by zero

Text Transformation¶

Markdown to Plain Text¶

import re

md = """
# Main Title
This is **bold** and *italic* text.
Here's a [link](https://example.com) and `inline code`.
- List item 1
- List item 2
"""

def md_to_plain(text):
    """Strip a small subset of Markdown syntax and return plain text.

    Handles ATX headers, ``-``/``*`` list items, bold, italic, inline links
    and inline code. Anything else is left untouched.

    Args:
        text: Markdown source.

    Returns:
        The text with the markup removed, list markers replaced by a
        bullet, and leading/trailing whitespace stripped.
    """
    text = re.sub(r'^#+\s+', '', text, flags=re.M)       # headers
    # List markers must be rewritten BEFORE emphasis: otherwise the "*" of a
    # "* item" line is paired with a later "*" and mistaken for italics.
    text = re.sub(r'^[-*]\s+', '• ', text, flags=re.M)   # list items
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)         # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)              # italic
    text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)      # links
    text = re.sub(r'`(.+?)`', r'\1', text)                # inline code
    return text.strip()

print(md_to_plain(md))
# Main Title
# This is bold and italic text.
# Here's a link and inline code.
# • List item 1
# • List item 2

CamelCase ↔ snake_case¶

import re

def camel_to_snake(name):
    """Return the snake_case form of a camelCase or PascalCase identifier."""
    # Pass 1: break wherever a lowercase letter or digit is followed by a capital.
    with_word_breaks = re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name)
    # Pass 2: split an acronym from the capitalised word after it
    # (e.g., HTTPResponse → HTTP_Response).
    with_acronym_breaks = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', with_word_breaks)
    return with_acronym_breaks.lower()

def snake_to_camel(name):
    """Return the camelCase form of a snake_case identifier."""
    # The first word is kept as-is; every later word is capitalised.
    head, *tail = name.split('_')
    return head + ''.join(map(str.capitalize, tail))

print(camel_to_snake("myVariableName"))    # my_variable_name
print(camel_to_snake("HTTPResponse"))       # http_response
print(camel_to_snake("getHTTPSUrl"))        # get_https_url

print(snake_to_camel("my_variable_name"))  # myVariableName
print(snake_to_camel("http_response"))      # httpResponse

Financial Data Patterns¶

Ticker and Price Extraction¶

import re

# NOTE: "$" needs no escaping inside an ordinary string literal. A stray
# backslash before it would end up in the text and stop STOCK_RE matching.
text = """
AAPL closed at $182.63 (+1.25%)
GOOGL rose to $141.80 (-0.34%)
MSFT ended at $378.91 (+0.89%)
"""

# ticker (1-5 capitals), two filler words, "$price", then "(±change%)"
STOCK_RE = re.compile(r'(?P<ticker>[A-Z]{1,5})\s+\w+\s+\w+\s+\$(?P<price>[\d.]+)\s+\((?P<change>[+-][\d.]+%)\)')

for m in STOCK_RE.finditer(text):
    d = m.groupdict()
    print(f"  {d['ticker']:>5}: ${d['price']} ({d['change']})")
#    AAPL: $182.63 (+1.25%)
#   GOOGL: $141.80 (-0.34%)
#    MSFT: $378.91 (+0.89%)

CSV Line Parser¶

import re

# One field per match. Each match consumes its leading delimiter (start of
# line or a comma), so no empty match is produced between real fields.
# A quoted field must be followed by a comma or the end of the line;
# otherwise the text is treated as a bare field.
_CSV_FIELD_RE = re.compile(r'(?:^|,)(?:"([^"]*(?:""[^"]*)*)"(?=,|$)|([^,]*))')

def parse_csv_line(line):
    """Parse one CSV line, handling quoted fields that contain commas.

    Args:
        line: A single CSV record without its line terminator.

    Returns:
        The list of field values in order. Quoted fields have their
        surrounding quotes removed and doubled quotes ("") collapsed to one;
        unquoted fields are stripped of surrounding whitespace. Empty fields
        are kept as '' (so 'a,,b' gives three fields, and an empty line
        gives a single empty field).
    """
    fields = []
    for m in _CSV_FIELD_RE.finditer(line):
        quoted, unquoted = m.groups()
        if quoted is not None:
            # Group 1 participates only for quoted fields, so an empty
            # quoted field ("") is still recognised as quoted.
            fields.append(quoted.replace('""', '"'))
        else:
            fields.append(unquoted.strip())
    return fields

line = 'John,"New York, NY",42,"He said ""hi"""'
print(parse_csv_line(line))
# ['John', 'New York, NY', '42', 'He said "hi"']

Common Regex Recipes¶

Task	Pattern
Integer	`r'[+-]?\d+'`
Float	`r'[+-]?\d*\.?\d+'`
Scientific notation	`r'[+-]?\d*\.?\d+(?:[eE][+-]?\d+)?'`
Hex color	`r'#[0-9a-fA-F]{3,6}\b'`
US phone	`r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'`
URL	`r'https?://\S+'`
Email (simple)	`r'[\w.+-]+@[\w-]+\.[\w.]+'`
IP address	`r'\b(?:\d{1,3}\.){3}\d{1,3}\b'`
ISO date	`r'\d{4}-\d{2}-\d{2}'`
Blank lines	`r'^\s*$'` with `re.M`
Duplicate words	`r'\b(\w+)\s+\1\b'`
HTML tags	`r'<[^>]+>'`

Regex Is Not Always the Answer

For structured formats like JSON, XML, HTML, or CSV, prefer dedicated parsers (json, xml.etree, html.parser, csv modules). Regex is best for semi-structured text, validation, and simple extraction tasks.

Summary¶

Category	Key Takeaway
Data cleaning	`re.sub()` for normalization; extract numbers from messy strings
Validation	`re.fullmatch()` with verbose patterns for readable validators
Log parsing	Named groups + `finditer()` for structured extraction
Transformation	Backreferences in `re.sub()` for reordering and reformatting
Financial data	Combine named groups with specific numeric patterns
When NOT to use	Prefer dedicated parsers for JSON, XML, HTML, CSV

Runnable Example: `practical_applications_tutorial.py`¶

"""
Python Regular Expressions - Tutorial 07: Practical Applications
================================================================

This tutorial demonstrates real-world applications of regex patterns.

DIFFICULTY: ADVANCED
"""

import re

# =============================================================================
# Main
# =============================================================================

def main():
    """Run each practical regex demo in turn and print its result.

    Covers email, URL and IPv4 validation, phone-number reformatting, link
    extraction from HTML, log-line parsing and text cleaning.
    """
    print("="*70)
    print("REAL-WORLD REGEX PATTERNS")
    print("="*70)

    # Email validation.
    # fullmatch() is used for all validators: with match(), the "$" anchor
    # would also accept a string that ends in a newline.
    email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    emails = ["user@example.com", "invalid@", "test@test.co.uk"]
    print("\nEmail Validation:")
    for email in emails:
        valid = bool(re.fullmatch(email_pattern, email))
        print(f"  {'✓' if valid else '✗'} {email}")

    # URL validation
    url_pattern = r"^https?://(?:www\.)?[\w.-]+\.\w+(?:/[\w./?%&=-]*)?$"
    urls = ["https://example.com", "http://www.test.org/page", "invalid"]
    print("\nURL Validation:")
    for url in urls:
        valid = bool(re.fullmatch(url_pattern, url))
        print(f"  {'✓' if valid else '✗'} {url}")

    # Phone number formatting: the three captured groups (area code,
    # exchange, line number) are re-emitted as "(AAA) EEE-LLLL".
    phone_text = "Call 5551234567 or 8005551234"
    phone_pattern = r"(\d{3})(\d{3})(\d{4})"
    formatted = re.sub(phone_pattern, r"(\1) \2-\3", phone_text)
    print(f"\nPhone Formatting:\n  Before: {phone_text}\n  After: {formatted}")

    # Extract all links from HTML
    html = '<a href="http://example.com">Link</a> <a href="http://test.org">Test</a>'
    link_pattern = r'href="(https?://[^"]+)"'
    links = re.findall(link_pattern, html)
    print(f"\nExtracted Links: {links}")

    # Data extraction from logs
    log = "2024-03-15 14:30:45 ERROR Connection failed"
    log_pattern = r"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) (.+)"
    match = re.search(log_pattern, log)
    if match:
        print("\nLog Parsing:")
        print(f"  Date: {match.group(1)}")
        print(f"  Time: {match.group(2)}")
        print(f"  Level: {match.group(3)}")
        print(f"  Message: {match.group(4)}")

    # Text cleaning: collapse whitespace, then reduce any run of "!"/"?"
    # to a single mark (\1 is the last punctuation character of the run).
    text = "Hello!!!  How are   you???  "
    cleaned = re.sub(r'\s+', ' ', text).strip()
    cleaned = re.sub(r'([!?]){2,}', r'\1', cleaned)
    print(f"\nText Cleaning:\n  Before: '{text}'\n  After: '{cleaned}'")

    # IP address validation
    ip_pattern = r"^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$"
    ips = ["192.168.1.1", "256.1.1.1", "10.0.0.1"]
    print("\nIP Address Validation:")
    for ip in ips:
        valid = bool(re.fullmatch(ip_pattern, ip))
        print(f"  {'✓' if valid else '✗'} {ip}")

    print("\n" + "="*70)
    print("END OF TUTORIAL - All concepts completed!")
    print("="*70)


if __name__ == "__main__":
    main()

Runnable Example: `regex_web_scraping_tutorial.py`¶

"""
Topic 7.10 - Regex for Web Scraping and Data Extraction Tutorial

Real-world regex patterns for extracting structured data from
semi-structured web content (HTML, JSON-in-HTML, API responses).

When APIs aren't available, web scraping often requires pulling data out of
raw HTML or embedded JSON. For small, predictable fragments, regex is a
pragmatic tool for this: find patterns in messy text and pull out clean,
structured fields. (For whole documents, prefer a real HTML or JSON parser.)

This tutorial builds a complete pipeline:
  raw text → regex extraction → deduplication → pandas DataFrame → CSV

Inspired by a Starbucks store location scraper that parses store data
(ID, name, latitude, longitude, city, state, zip) from web responses.

Learning Objectives:
- Writing multi-group regex patterns for structured data extraction
- Extracting repeated records from large text blocks
- Non-greedy matching (.*?) to avoid over-matching
- Named groups for readability
- Deduplication patterns for scraped data
- Building DataFrames from extracted data
- Combining regex with pandas for data pipelines

Prerequisites:
- ch07/regex (regex basics, groups, quantifiers)
- ch10/core (pandas DataFrame basics)
- ch15/networking (HTTP requests, web scraping)

Author: Python Educator
Date: 2024
"""

import re
import csv
import json
from typing import Any


# ============================================================================
# PART 1: BEGINNER - Extracting Fields from Structured Text
# ============================================================================

def demonstrate_basic_field_extraction():
    """
    Show how capturing groups pull individual fields out of semi-structured text.

    Three short demos are printed in order: single-field searches against a
    JSON blob embedded in a <script> tag, greedy versus non-greedy
    quantifiers on quoted values, and named groups applied to a log line.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 50

    print(heavy_rule)
    print("BEGINNER: Extracting Fields from Structured Text")
    print(heavy_rule)

    # --- Demo 1: one small regex per field ---
    print("\n1. Extracting from JSON-like text in HTML")
    print(light_rule)

    # Stand-in for the inline data a store-locator page might embed
    html_fragment = '''
    <script>var storeData = {"storeNumber":"12345","name":"Downtown Plaza",
    "coordinates":{"latitude":34.0522,"longitude":-118.2437},
    "address":{"city":"Los Angeles","state":"CA","postalCode":"90012"}};
    </script>
    '''

    # Store number: digits only, inside quotes
    if (number_hit := re.search(r'"storeNumber":"(\d+)"', html_fragment)) is not None:
        print(f"   Store number: {number_hit.group(1)}")

    # Store name: everything up to the closing quote
    if (name_hit := re.search(r'"name":"([^"]+)"', html_fragment)) is not None:
        print(f"   Store name  : {name_hit.group(1)}")

    # Coordinates are unquoted numbers; both must be present to be reported
    lat_hit = re.search(r'"latitude":([-\d.]+)', html_fragment)
    lon_hit = re.search(r'"longitude":([-\d.]+)', html_fragment)
    if lat_hit is not None and lon_hit is not None:
        print(f"   Latitude    : {lat_hit.group(1)}")
        print(f"   Longitude   : {lon_hit.group(1)}")

    # City
    if (city_hit := re.search(r'"city":"([^"]+)"', html_fragment)) is not None:
        print(f"   City        : {city_hit.group(1)}")

    # --- Demo 2: greedy versus non-greedy quantifiers ---
    print("\n2. Non-greedy matching: .*? vs .*")
    print(light_rule)

    text = '"name":"First Store","other":"data","name":"Second Store"'

    # .* runs to the LAST closing quote on the line
    greedy_hit = re.search(r'"name":"(.*)"', text)
    greedy_value = greedy_hit.group(1) if greedy_hit else 'None'
    print(f"   Greedy  .*  : {greedy_value}")
    # -> First Store","other":"data","name":"Second Store

    # .*? stops at the FIRST closing quote
    lazy_hit = re.search(r'"name":"(.*?)"', text)
    lazy_value = lazy_hit.group(1) if lazy_hit else 'None'
    print(f"   Non-greedy .*? : {lazy_value}")
    # -> First Store

    print("\n   Key insight: Use .*? when extracting from quoted values")
    print("   to stop at the FIRST closing quote, not the last one.")

    # --- Demo 3: named groups ---
    print("\n3. Named groups (?P<name>...) for readability")
    print(light_rule)

    log_line = '2024-03-15 14:30:45 [ERROR] Server: Connection refused (port 5432)'

    pattern = "".join([
        r'(?P<date>\d{4}-\d{2}-\d{2}) ',
        r'(?P<time>\d{2}:\d{2}:\d{2}) ',
        r'\[(?P<level>\w+)\] ',
        r'(?P<source>\w+): ',
        r'(?P<message>.+)',
    ])

    match = re.search(pattern, log_line)
    if match is not None:
        # Group names are left-padded to a common width so the "=" signs align
        for group_name in ("date", "time", "level", "source", "message"):
            print(f"   {group_name:<7} = {match.group(group_name)}")
        print(f"\n   As dict: {match.groupdict()}")

    print(f"\n{heavy_rule}\n")


# ============================================================================
# PART 2: INTERMEDIATE - Multi-Record Extraction from Web Content
# ============================================================================

def demonstrate_multi_record_extraction() -> list[tuple]:
    """
    Extract multiple records from a block of text using regex.

    Web pages often contain many similar data records (store listings,
    product cards, search results). The strategy:
    1. Use re.findall() or re.finditer() to find all record boundaries
    2. Extract fields from each record with groups

    Returns:
        One 7-tuple of strings per store found in the sample response:
        (store_id, name, latitude, longitude, city, state, zip_code).
        Duplicated stores are NOT removed here.
    """
    print("=" * 70)
    print("INTERMEDIATE: Extracting Multiple Records")
    print("=" * 70)

    # Simulate a web response with multiple store records
    # This mimics what a store locator API might return embedded in HTML
    response_text = '''
    {"stores":[
        {"storeNumber":"10234","name":"Main St & 5th","coordinates":
         {"latitude":34.0522,"longitude":-118.2437},"address":
         {"city":"Los Angeles","countrySubdivisionCode":"CA","postalCode":"90012"},"slug":"main-st"},
        {"storeNumber":"10567","name":"Hollywood & Vine","coordinates":
         {"latitude":34.1017,"longitude":-118.3267},"address":
         {"city":"Hollywood","countrySubdivisionCode":"CA","postalCode":"90028"},"slug":"hollywood"},
        {"storeNumber":"10891","name":"Venice Boardwalk","coordinates":
         {"latitude":33.9850,"longitude":-118.4695},"address":
         {"city":"Venice","countrySubdivisionCode":"CA","postalCode":"90291"},"slug":"venice"},
        {"storeNumber":"10234","name":"Main St & 5th","coordinates":
         {"latitude":34.0522,"longitude":-118.2437},"address":
         {"city":"Los Angeles","countrySubdivisionCode":"CA","postalCode":"90012"},"slug":"duplicate"}
    ]}
    '''

    # --- Step 1: Split into individual records ---
    print("\n1. Finding individual records with re.findall()")
    print("-" * 50)

    # Each record runs from "storeNumber" to "slug"; DOTALL lets .*? cross
    # the line breaks inside a record.
    record_pattern = r'"storeNumber":.*?"slug"'
    records = re.findall(record_pattern, response_text, re.DOTALL)
    print(f"   Found {len(records)} records")

    # --- Step 2: Extract fields from each record ---
    print("\n2. Extracting fields with multi-group pattern")
    print("-" * 50)

    # This pattern uses 7 capturing groups to extract all fields at once
    field_pattern = (
        r'"storeNumber":"(.*?)"'       # Group 1: store ID
        r'.*?"name":"(.*?)"'           # Group 2: store name
        r'.*?"latitude":([\d.-]+)'     # Group 3: latitude
        r'.*?"longitude":([\d.-]+)'    # Group 4: longitude
        r'.*?"city":"(.*?)"'           # Group 5: city
        r'.*?"countrySubdivisionCode":"(.*?)"'  # Group 6: state
        r'.*?"postalCode":"(.*?)"'     # Group 7: zip code
    )
    # Compiled once: the same pattern is applied to every record below and
    # then to the whole response.
    field_re = re.compile(field_pattern, re.DOTALL)

    for record in records:
        match = field_re.search(record)
        if match:
            store_id, name, lat, lon, city, state, zip_code = match.groups()
            print(f"   Store {store_id}: {name}")
            print(f"     Location: ({lat}, {lon})")
            print(f"     Address:  {city}, {state} {zip_code}")
            print()

    # --- Alternative: re.findall with groups returns list of tuples ---
    print("3. Shortcut: re.findall() with groups")
    print("-" * 50)

    # When the pattern has groups, findall returns list of tuples.
    # Caveat: run against the whole response, a lazy .*? with DOTALL will
    # reach into the NEXT record if a field is missing from the current one,
    # so this shortcut is only safe when every record has every field.
    all_matches = field_re.findall(response_text)
    print(f"   re.findall returned {len(all_matches)} tuples")
    for m in all_matches:
        print(f"   {m[0]}: {m[1]} in {m[4]}, {m[5]}")

    print("\n" + "=" * 70 + "\n")
    return all_matches


# ============================================================================
# PART 3: INTERMEDIATE - Deduplication Patterns
# ============================================================================

def demonstrate_deduplication(records: list[tuple]) -> list[tuple]:
    """
    Remove repeated stores from a list of scraped records.

    Scraped data commonly contains repeats (the same item on several pages,
    overlapping search regions, pagination glitches). The first element of
    each record is treated as its identity; the first occurrence of each
    identity is kept and later ones are dropped.

    Args:
        records: List of tuples (store_id, name, lat, lon, city, state, zip)

    Returns:
        The records with duplicates removed, in their original order
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 50

    print(heavy_rule)
    print("INTERMEDIATE: Deduplication Patterns for Scraped Data")
    print(heavy_rule)

    print(f"\n   Records before dedup: {len(records)}")

    # --- Method 1: remember every ID already emitted ---
    print("\n1. Method 1: Track seen IDs with a set")
    print(light_rule)

    seen_ids = set()
    unique_records = []
    duplicates = 0

    for rec in records:
        rec_id = rec[0]
        if rec_id in seen_ids:
            duplicates += 1
            print(f"   Duplicate found: store {rec_id} ({rec[1]})")
            continue
        seen_ids.add(rec_id)
        unique_records.append(rec)

    print(f"   After dedup: {len(unique_records)} unique, {duplicates} removed")

    # --- Method 2: a dict keyed by ID; the first record stored for a key wins ---
    print("\n2. Method 2: dict keyed by ID (preserves insertion order)")
    print(light_rule)

    store_dict = {}
    for rec in records:
        # setdefault only stores the record when the ID is not present yet
        store_dict.setdefault(rec[0], rec)

    unique_via_dict = list(store_dict.values())
    print(f"   Result: {len(unique_via_dict)} unique records")

    # --- Method 3: pandas (shown as text only, nothing is executed) ---
    print("\n3. Method 3: pandas drop_duplicates (best for large data)")
    print(light_rule)
    for snippet_line in (
        "   import pandas as pd",
        "   df = pd.DataFrame(records, columns=[...])",
        "   df = df.drop_duplicates(subset=['store_id'])",
        "   # Handles thousands of records efficiently",
    ):
        print(snippet_line)

    # --- Complexity notes ---
    print("\n4. Why set-based dedup is O(n)")
    print(light_rule)
    for note_line in (
        "   set.add() and 'in set' are both O(1) average case",
        "   Looping through n records: n × O(1) = O(n)",
        "   vs. list-based 'if id in seen_list': O(n) per check → O(n²)",
        "   For 10,000 records: set is ~10,000x faster than list!",
    ):
        print(note_line)

    print(f"\n{heavy_rule}\n")
    return unique_records


# ============================================================================
# PART 4: INTERMEDIATE - Building a DataFrame from Scraped Data
# ============================================================================

def demonstrate_dataframe_construction(records: list[tuple]) -> None:
    """
    Convert extracted records into a structured pandas DataFrame.

    This is the typical final step in a scraping pipeline:
    raw text → regex → tuples → DataFrame → CSV/analysis

    The CSV is written to "scraped_stores.csv" in the system temporary
    directory. When pandas is not installed, the same file is written with
    the standard csv module instead and the analysis steps are skipped.

    Args:
        records: Accepted so the function slots into the pipeline, but NOT
            used: the demo always works on its own built-in sample of ten
            stores.
    """
    # Local imports: only this demo needs them.
    import os
    import tempfile

    print("=" * 70)
    print("INTERMEDIATE: Building a DataFrame from Scraped Data")
    print("=" * 70)

    columns = ["store_id", "name", "latitude", "longitude",
               "city", "state", "zip_code"]

    # Resolved at run time instead of hard-coding "/tmp", which does not
    # exist on every platform.
    csv_path = os.path.join(tempfile.gettempdir(), "scraped_stores.csv")

    # --- Method 1: From list of tuples ---
    print("\n1. DataFrame from list of tuples")
    print("-" * 50)

    # We'll use a simulated dataset here (real scraping would use actual data)
    sample_records = [
        ("10234", "Main St & 5th",       "34.0522", "-118.2437", "Los Angeles", "CA", "90012"),
        ("10567", "Hollywood & Vine",     "34.1017", "-118.3267", "Hollywood",   "CA", "90028"),
        ("10891", "Venice Boardwalk",     "33.9850", "-118.4695", "Venice",      "CA", "90291"),
        ("11023", "Pasadena Old Town",    "34.1478", "-118.1445", "Pasadena",    "CA", "91101"),
        ("11156", "Santa Monica Pier",    "34.0094", "-118.4973", "Santa Monica", "CA", "90401"),
        ("11289", "Beverly Hills Plaza",  "34.0736", "-118.4004", "Beverly Hills", "CA", "90210"),
        ("11422", "Downtown Arts Dist",   "34.0402", "-118.2351", "Los Angeles", "CA", "90013"),
        ("11555", "Glendale Galleria",    "34.1456", "-118.2551", "Glendale",    "CA", "91210"),
        ("11688", "Burbank Media Dist",   "34.1808", "-118.3090", "Burbank",     "CA", "91502"),
        ("11821", "Long Beach Pike",      "33.7675", "-118.1924", "Long Beach",  "CA", "90802"),
    ]

    # The try body holds only the import, so an ImportError can only mean
    # "pandas is not installed" and never hides a failure in the demo code.
    try:
        import pandas as pd
    except ImportError:
        print("   (pandas not available — showing CSV approach instead)")

        # Fallback: pure csv module
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(columns)
            writer.writerows(sample_records)
        print(f"   Wrote {len(sample_records)} records to {csv_path}")
    else:
        # Direct construction — most common approach
        df = pd.DataFrame(sample_records, columns=columns)

        # Convert numeric columns from strings
        df["latitude"] = df["latitude"].astype(float)
        df["longitude"] = df["longitude"].astype(float)

        print(f"   Shape: {df.shape}")
        print(f"   Dtypes:\n{df.dtypes}")
        print("\n   Preview:")
        print(df[["store_id", "name", "city", "zip_code"]].to_string(index=False))

        # --- Quick analysis ---
        print("\n2. Quick analysis of scraped data")
        print("-" * 50)

        print(f"   Total stores    : {len(df)}")
        print(f"   Unique cities   : {df['city'].nunique()}")
        print("   Stores per city :")
        city_counts = df["city"].value_counts()
        for city, count in city_counts.items():
            print(f"     {city}: {count}")

        print("\n   Geographic bounds:")
        print(f"     Lat range : {df['latitude'].min():.4f} to {df['latitude'].max():.4f}")
        print(f"     Lon range : {df['longitude'].min():.4f} to {df['longitude'].max():.4f}")

        # --- Save to CSV ---
        print("\n3. Saving to CSV")
        print("-" * 50)

        df.to_csv(csv_path, index=False)
        print(f"   Saved {len(df)} records to {csv_path}")

        # Verify by reading back
        df_check = pd.read_csv(csv_path)
        print(f"   Verified: read back {len(df_check)} records")

    print("\n" + "=" * 70 + "\n")


# ============================================================================
# PART 5: ADVANCED - Complete Regex Scraping Pipeline
# ============================================================================

def demonstrate_complete_pipeline() -> None:
    """
    A complete pipeline that combines all the patterns:
    1. Simulate fetching a web response
    2. Extract records with regex
    3. Clean and normalize fields
    4. Deduplicate
    5. Build DataFrame
    6. Export to CSV

    The CSV is written to "scraped_stores_pipeline.csv" in the system
    temporary directory. pandas is optional: without it, step 5 prints the
    cleaned records as plain dicts.
    """
    # Local imports: only this demo needs them.
    import os
    import tempfile

    print("=" * 70)
    print("ADVANCED: Complete Regex Scraping Pipeline")
    print("=" * 70)

    # --- Step 1: Simulated web response ---
    # In practice, this comes from requests.get(url).text
    print("\n--- Step 1: Receive web response ---")

    response_text = '''
    window.__INITIAL_STATE__ = {"stores":{"items":[
    {"storeNumber":"10234","name":"Main St \\u0026 5th Ave","coordinates":
     {"latitude":34.0522,"longitude":-118.2437},"address":
     {"city":"Los Angeles","countrySubdivisionCode":"CA",
      "postalCode":"90012-1234"},"slug":"main-st"},
    {"storeNumber":"10567","name":"Hollywood \\u0026 Vine","coordinates":
     {"latitude":34.1017,"longitude":-118.3267},"address":
     {"city":"Hollywood ","countrySubdivisionCode":"CA",
      "postalCode":"90028"},"slug":"hw-vine"},
    {"storeNumber":"10891","name":"Venice  Boardwalk","coordinates":
     {"latitude":33.9850,"longitude":-118.4695},"address":
     {"city":"Venice","countrySubdivisionCode":"CA",
      "postalCode":"90291"},"slug":"venice-bw"},
    {"storeNumber":"10234","name":"Main St \\u0026 5th Ave","coordinates":
     {"latitude":34.0522,"longitude":-118.2437},"address":
     {"city":"Los Angeles","countrySubdivisionCode":"CA",
      "postalCode":"90012-1234"},"slug":"main-st-dup"}
    ]}};
    '''

    print(f"   Response length: {len(response_text)} chars")

    # --- Step 2: Extract records ---
    print("\n--- Step 2: Extract records with regex ---")

    record_pattern = r'"storeNumber":.*?"slug"'
    raw_records = re.findall(record_pattern, response_text, re.DOTALL)
    print(f"   Found {len(raw_records)} raw records")

    # Compiled once; applied to each raw record in turn.
    field_re = re.compile(
        r'"storeNumber":"(?P<id>.*?)"'
        r'.*?"name":"(?P<name>.*?)"'
        r'.*?"latitude":(?P<lat>[\d.-]+)'
        r'.*?"longitude":(?P<lon>[\d.-]+)'
        r'.*?"city":"(?P<city>.*?)"'
        r'.*?"countrySubdivisionCode":"(?P<state>.*?)"'
        r'.*?"postalCode":"(?P<zip>.*?)"',
        re.DOTALL,
    )

    extracted = []
    for record in raw_records:
        match = field_re.search(record)
        if match:
            extracted.append(match.groupdict())

    print(f"   Extracted {len(extracted)} records")
    for rec in extracted:
        print(f"   {rec['id']}: {rec['name']} — {rec['city']}, {rec['state']}")

    # --- Step 3: Clean and normalize ---
    print("\n--- Step 3: Clean and normalize fields ---")

    def decode_json_escapes(raw: str) -> str:
        """Decode JSON string escapes (e.g. \\u0026 → &) in a captured value.

        The value was cut out of a JSON string literal, so it is re-quoted
        and handed to the JSON decoder. This leaves non-ASCII characters
        intact, which the encode("utf-8").decode("unicode_escape") idiom
        does not. Text that is not a valid JSON string body is returned
        unchanged.
        """
        try:
            return json.loads(f'"{raw}"')
        except ValueError:
            return raw

    def clean_record(record: dict) -> dict:
        """
        Clean a single scraped record.

        Common cleaning tasks:
        - Decode unicode escapes (\\u0026 → &)
        - Strip whitespace
        - Normalize zip codes (take first 5 characters)
        - Collapse multiple spaces
        """
        cleaned = {}
        cleaned["store_id"] = record["id"].strip()
        # Decode escapes, trim, then collapse internal runs of whitespace
        name = decode_json_escapes(record["name"]).strip()
        cleaned["name"] = re.sub(r"\s+", " ", name)
        cleaned["latitude"] = float(record["lat"])
        cleaned["longitude"] = float(record["lon"])
        cleaned["city"] = record["city"].strip()
        cleaned["state"] = record["state"].strip()
        # Normalize ZIP+4 ("90012-1234") to the 5-character ZIP
        cleaned["zip_code"] = record["zip"].strip()[:5]
        return cleaned

    cleaned_records = [clean_record(r) for r in extracted]
    print(f"   Cleaned {len(cleaned_records)} records")
    for rec in cleaned_records:
        print(f"   {rec['store_id']}: {rec['name']} — {rec['city']}, "
              f"{rec['state']} {rec['zip_code']}")

    # --- Step 4: Deduplicate ---
    print("\n--- Step 4: Deduplicate ---")

    # First occurrence of each store_id wins; later ones are reported and dropped
    seen = set()
    unique = []
    for rec in cleaned_records:
        if rec["store_id"] not in seen:
            seen.add(rec["store_id"])
            unique.append(rec)
        else:
            print(f"   Removed duplicate: {rec['store_id']} ({rec['name']})")

    print(f"   {len(cleaned_records)} → {len(unique)} after dedup")

    # --- Step 5: Build DataFrame ---
    print("\n--- Step 5: Build DataFrame ---")

    # The try body holds only the import, so an ImportError can only mean
    # "pandas is not installed".
    try:
        import pandas as pd
    except ImportError:
        for rec in unique:
            print(f"   {rec}")
    else:
        df = pd.DataFrame(unique)
        print(df.to_string(index=False))

    # --- Step 6: Save to CSV ---
    print("\n--- Step 6: Save to CSV ---")

    # Resolved at run time instead of hard-coding "/tmp", which does not
    # exist on every platform.
    csv_path = os.path.join(tempfile.gettempdir(), "scraped_stores_pipeline.csv")
    fieldnames = ["store_id", "name", "latitude", "longitude",
                   "city", "state", "zip_code"]

    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(unique)

    print(f"   Saved {len(unique)} records to {csv_path}")

    print("\n" + "=" * 70 + "\n")


# ============================================================================
# PART 6: ADVANCED - Regex Patterns for Common Web Formats
# ============================================================================

def demonstrate_common_web_patterns():
    """
    A reference of regex patterns commonly needed in web scraping.

    Runs five self-contained examples against inline sample strings and
    prints the results to stdout:

    1. JSON assigned to a ``window.*`` variable inside a <script> tag
    2. Rows of an HTML table
    3. Latitude/longitude pairs written in four different formats
    4. <meta> name/property and content attributes
    5. Page numbers in pagination links

    Returns nothing; the function exists purely for its printed output.
    """
    print("=" * 70)
    print("ADVANCED: Common Regex Patterns for Web Scraping")
    print("=" * 70)

    # --- JSON embedded in HTML (window.__DATA__) ---
    print("\n1. Extracting embedded JSON from HTML <script> tags")
    print("-" * 50)

    html = '''
    <html><head>
    <script>window.__INITIAL_DATA__ = {"user":"alice","count":42};</script>
    </head></html>
    '''

    # Non-greedy: the capture stops at the first "};" after the assignment.
    pattern = r'window\.__INITIAL_DATA__\s*=\s*({.*?});'
    match = re.search(pattern, html)
    if match:
        data = json.loads(match.group(1))
        print(f"   Extracted: {data}")
        print(f"   user={data['user']}, count={data['count']}")

    # --- HTML table rows ---
    print("\n2. Extracting data from HTML tables")
    print("-" * 50)

    table_html = '''
    <table>
    <tr><td>Apple</td><td>$1.20</td><td>In Stock</td></tr>
    <tr><td>Banana</td><td>$0.50</td><td>Low Stock</td></tr>
    <tr><td>Cherry</td><td>$3.00</td><td>Out of Stock</td></tr>
    </table>
    '''

    # Three groups per row -> findall returns a list of 3-tuples.
    row_pattern = r'<tr><td>(.*?)</td><td>\$([\d.]+)</td><td>(.*?)</td></tr>'
    rows = re.findall(row_pattern, table_html)
    print(f"   Found {len(rows)} rows:")
    for name, price, status in rows:
        print(f"   {name}: ${price} ({status})")

    # --- Coordinates from various formats ---
    print("\n3. Extracting coordinates from various formats")
    print("-" * 50)

    texts = [
        'Located at lat: 34.0522, lng: -118.2437',
        'GPS: (40.7128, -74.0060)',
        '"latitude":51.5074,"longitude":-0.1278',
        'position="48.8566,2.3522"',
    ]

    # A signed decimal number. A loose class such as [\d.-]+ would also
    # accept "-", "..", "1-2", or swallow a sentence-ending period
    # ("-118.2437."), none of which can be converted with float().
    num = r'-?\d+(?:\.\d+)?'

    coord_patterns = [
        (rf'lat:\s*({num}),\s*lng:\s*({num})', "lat/lng format"),
        (rf'\(({num}),\s*({num})\)', "parenthesized format"),
        (rf'"latitude":({num}).*?"longitude":({num})', "JSON format"),
        (rf'position="({num}),({num})"', "attribute format"),
    ]

    # texts and coord_patterns are parallel lists: the i-th pattern is the
    # one written for the i-th sample text.
    for text, (coord_re, fmt) in zip(texts, coord_patterns):
        match = re.search(coord_re, text)
        if match:
            print(f"   {fmt}: ({match.group(1)}, {match.group(2)})")

    # --- Meta tags and Open Graph ---
    print("\n4. Extracting meta tags from HTML")
    print("-" * 50)

    html_head = '''
    <meta name="description" content="Best coffee shop in LA">
    <meta property="og:title" content="Starbucks - Main St">
    <meta property="og:image" content="https://example.com/store.jpg">
    '''

    # Accepts either name="..." or property="..." followed by content="...".
    meta_pattern = r'<meta\s+(?:name|property)="([^"]+)"\s+content="([^"]+)"'
    metas = re.findall(meta_pattern, html_head)
    for name, content in metas:
        print(f"   {name}: {content}")

    # --- Pagination links ---
    print("\n5. Extracting pagination info")
    print("-" * 50)

    pagination_html = '''
    <a href="/stores?page=1">1</a>
    <a href="/stores?page=2">2</a>
    <a href="/stores?page=3" class="current">3</a>
    <a href="/stores?page=4">4</a>
    <a href="/stores?page=5">Next</a>
    '''

    page_pattern = r'href="/stores\?page=(\d+)"'
    pages = re.findall(page_pattern, pagination_html)
    print(f"   Available pages: {pages}")
    # findall returns strings; compare numerically so "10" would beat "9".
    print(f"   Last page: {max(int(p) for p in pages)}")

    print("\n" + "=" * 70 + "\n")


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Run every demonstration in tutorial order, then print the takeaways."""
    rule = "=" * 70

    # Title banner
    print("\n" + rule)
    print(" " * 8 + "REGEX FOR WEB SCRAPING AND DATA EXTRACTION")
    print(" " * 20 + "Complete Tutorial")
    print(rule + "\n")

    # Beginner
    demonstrate_basic_field_extraction()

    # Intermediate: each stage is fed the previous stage's return value
    extracted = demonstrate_multi_record_extraction()
    deduped = demonstrate_deduplication(extracted)
    demonstrate_dataframe_construction(deduped)

    # Advanced
    demonstrate_complete_pipeline()
    demonstrate_common_web_patterns()

    # Closing summary
    takeaways = (
        "1. Use .*? (non-greedy) to extract quoted values",
        "2. Multi-group patterns extract several fields at once",
        "3. re.findall() with groups returns list of tuples",
        "4. Named groups (?P<name>...) improve readability",
        "5. Always deduplicate scraped data (use sets for O(n))",
        "6. Clean data before analysis: strip, normalize, convert types",
        "7. Pipeline: fetch → regex → clean → dedup → DataFrame → CSV",
        "8. re.DOTALL flag lets . match newlines in multi-line content",
    )

    print("\n" + rule)
    print("Tutorial Complete!")
    print(rule)
    print("\nKey Takeaways:")
    for line in takeaways:
        print(line)
    print(rule + "\n")


# Run the full tutorial only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()