Vectorization¶

Vectorization is the practice of applying operations to entire arrays at once, rather than iterating through elements. Vectorized operations in pandas are typically 10-100x faster than loops.

Why Vectorization Matters¶

The Problem with Loops¶

import pandas as pd
import numpy as np
import time

df = pd.DataFrame({
    'A': np.random.randn(100000),
    'B': np.random.randn(100000)
})

# Slow: iterating with a loop
def slow_sum(df):
    result = []
    for i in range(len(df)):
        result.append(df.iloc[i]['A'] + df.iloc[i]['B'])
    return result

# Even slower: iterrows
def slower_sum(df):
    result = []
    for idx, row in df.iterrows():
        result.append(row['A'] + row['B'])
    return result

The Vectorized Solution¶

# Fast: vectorized operation
df['C'] = df['A'] + df['B']

Performance Comparison¶

n = 100000
df = pd.DataFrame({
    'A': np.random.randn(n),
    'B': np.random.randn(n)
})

# Method 1: iterrows (slowest)
start = time.time()
result = []
for idx, row in df.iterrows():
    result.append(row['A'] + row['B'])
iterrows_time = time.time() - start

# Method 2: apply (slow)
start = time.time()
result = df.apply(lambda row: row['A'] + row['B'], axis=1)
apply_time = time.time() - start

# Method 3: vectorized (fast)
start = time.time()
result = df['A'] + df['B']
vector_time = time.time() - start

print(f"iterrows: {iterrows_time:.3f}s")
print(f"apply:    {apply_time:.3f}s")
print(f"vectorized: {vector_time:.6f}s")

Typical results:

iterrows: 4.521s
apply:    1.234s
vectorized: 0.001s

Common Vectorized Operations¶

Arithmetic¶

# Element-wise operations
df['sum'] = df['A'] + df['B']
df['diff'] = df['A'] - df['B']
df['product'] = df['A'] * df['B']
df['ratio'] = df['A'] / df['B']
df['power'] = df['A'] ** 2

Comparisons¶

# Returns boolean Series
mask = df['A'] > 0
mask = df['A'] >= df['B']
mask = (df['A'] > 0) & (df['B'] < 0)
mask = (df['A'] > 0) | (df['B'] > 0)

String Operations (with .str accessor)¶

df = pd.DataFrame({'text': ['hello', 'world', 'python']})

# Vectorized string operations
df['upper'] = df['text'].str.upper()
df['length'] = df['text'].str.len()
df['contains_o'] = df['text'].str.contains('o')

Datetime Operations (with .dt accessor)¶

df = pd.DataFrame({'date': pd.date_range('2024-01-01', periods=100)})

# Vectorized datetime operations
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['weekday'] = df['date'].dt.dayofweek

Replacing Loops with Vectorization¶

Conditional Assignment¶

# Instead of:
for i in range(len(df)):
    if df.loc[i, 'A'] > 0:
        df.loc[i, 'result'] = 'positive'
    else:
        df.loc[i, 'result'] = 'negative'

# Use:
df['result'] = np.where(df['A'] > 0, 'positive', 'negative')

Multiple Conditions¶

# Instead of complex if-elif-else in loop:
conditions = [
    df['A'] > 1,
    df['A'] > 0,
    df['A'] > -1
]
choices = ['high', 'medium', 'low']
df['category'] = np.select(conditions, choices, default='very_low')

Cumulative Operations¶

# Instead of loop with running total:
df['cumsum'] = df['A'].cumsum()
df['cumprod'] = df['A'].cumprod()
df['cummax'] = df['A'].cummax()
df['cummin'] = df['A'].cummin()

Shifting and Differencing¶

# Instead of loop comparing to previous row:
df['prev_A'] = df['A'].shift(1)
df['change'] = df['A'].diff()
df['pct_change'] = df['A'].pct_change()

Rolling Operations¶

# Instead of loop calculating moving average:
df['rolling_mean'] = df['A'].rolling(window=5).mean()
df['rolling_std'] = df['A'].rolling(window=5).std()
df['rolling_sum'] = df['A'].rolling(window=5).sum()

When apply() Is Acceptable¶

Sometimes apply() is necessary, but optimize the function:

# Slow: complex logic in apply
def complex_calculation(row):
    if row['A'] > 0 and row['B'] > 0:
        return row['A'] * row['B']
    elif row['A'] < 0 and row['B'] < 0:
        return -row['A'] * row['B']
    else:
        return 0

# Faster: vectorize the logic
mask1 = (df['A'] > 0) & (df['B'] > 0)
mask2 = (df['A'] < 0) & (df['B'] < 0)

df['result'] = 0  # default
df.loc[mask1, 'result'] = df.loc[mask1, 'A'] * df.loc[mask1, 'B']
df.loc[mask2, 'result'] = -df.loc[mask2, 'A'] * df.loc[mask2, 'B']

Using NumPy Functions¶

NumPy functions work directly on pandas objects:

# Math functions
df['abs_A'] = np.abs(df['A'])
df['sqrt_abs'] = np.sqrt(np.abs(df['A']))
df['log_abs'] = np.log(np.abs(df['A']) + 1)
df['exp_A'] = np.exp(df['A'])

# Trigonometric
df['sin_A'] = np.sin(df['A'])
df['cos_A'] = np.cos(df['A'])

# Rounding
df['floor'] = np.floor(df['A'])
df['ceil'] = np.ceil(df['A'])
df['round'] = np.round(df['A'], 2)

Practical Example: Financial Calculations¶

# Stock price data
np.random.seed(42)
prices = pd.DataFrame({
    'close': 100 + np.cumsum(np.random.randn(10000) * 0.5)
})

# All vectorized calculations
prices['return'] = prices['close'].pct_change()
prices['log_return'] = np.log(prices['close'] / prices['close'].shift(1))
prices['sma_20'] = prices['close'].rolling(20).mean()
prices['sma_50'] = prices['close'].rolling(50).mean()
prices['std_20'] = prices['return'].rolling(20).std()
prices['upper_band'] = prices['sma_20'] + 2 * prices['std_20'] * prices['close']
prices['lower_band'] = prices['sma_20'] - 2 * prices['std_20'] * prices['close']
prices['signal'] = np.where(prices['sma_20'] > prices['sma_50'], 1, -1)

Performance Tips¶

1. Avoid iterrows()¶

# Never do this for large DataFrames
for idx, row in df.iterrows():
    # process row
    pass

2. Minimize apply()¶

# Try to replace apply with vectorized operations
# Bad
df['result'] = df.apply(lambda x: x['A'] + x['B'], axis=1)

# Good
df['result'] = df['A'] + df['B']

3. Use eval() for Complex Expressions¶

# For complex arithmetic on large DataFrames
df.eval('result = (A + B) * (C - D) / E', inplace=True)

4. Batch Operations¶

# Instead of multiple separate operations
df['A'] = df['A'] * 2
df['B'] = df['B'] + 1
df['C'] = df['A'] + df['B']

# Consider eval for batch
df.eval('''
    A = A * 2
    B = B + 1
    C = A + B
''', inplace=True)

Summary¶

Method	Speed	Use Case
Vectorized ops	⚡ Fastest	Arithmetic, comparisons
NumPy functions	⚡ Fast	Math operations
eval()	⚡ Fast	Complex expressions
apply(axis=0)	🔶 Moderate	Column-wise operations
apply(axis=1)	🔴 Slow	Row-wise operations
iterrows()	🔴 Slowest	Avoid if possible

Rule of thumb: If you're writing a loop over DataFrame rows, there's almost always a vectorized alternative.

Runnable Example: `performance_tutorial.py`¶

"""
Pandas Tutorial: Performance Optimization.

Covers techniques for faster data processing.
"""

import pandas as pd
import numpy as np
import time

# =============================================================================
# Main
# =============================================================================

if __name__ == "__main__":

    print("="*70)
    print("PERFORMANCE OPTIMIZATION")
    print("="*70)

    # Create large dataset
    n = 100000
    np.random.seed(42)
    df = pd.DataFrame({
        'A': np.random.randint(0, 100, n),
        'B': np.random.randint(0, 100, n),
        'C': np.random.choice(['X', 'Y', 'Z'], n),
        'D': np.random.random(n)
    })

    print(f"\nDataFrame with {len(df):,} rows")
    print(df.head())

    # 1. Vectorized operations vs loops
    print("\n1. Vectorized operations vs loops:")

    # Loop (slow)
    start = time.time()
    result_loop = []
    for val in df['A']:
        result_loop.append(val * 2)
    loop_time = time.time() - start

    # Vectorized (fast)
    start = time.time()
    result_vec = df['A'] * 2
    vec_time = time.time() - start

    print(f"Loop time: {loop_time:.4f}s")
    print(f"Vectorized time: {vec_time:.4f}s")
    print(f"Speedup: {loop_time/vec_time:.2f}x")

    # 2. Use categorical for repeated strings
    print("\n2. Memory optimization with categorical:")
    print(f"Original memory: {df['C'].memory_usage(deep=True):,} bytes")
    df['C_cat'] = df['C'].astype('category')
    print(f"Categorical memory: {df['C_cat'].memory_usage(deep=True):,} bytes")

    # 3. Query method (faster than boolean indexing for large datasets)
    print("\n3. Query method:")
    start = time.time()
    result1 = df[(df['A'] > 50) & (df['B'] < 30)]
    bool_time = time.time() - start

    start = time.time()
    result2 = df.query('A > 50 and B < 30')
    query_time = time.time() - start

    print(f"Boolean indexing: {bool_time:.4f}s")
    print(f"Query method: {query_time:.4f}s")

    # 4. Use appropriate dtypes
    print("\n4. Optimize data types:")
    df_types = pd.DataFrame({
        'int_col': [1, 2, 3, 4, 5] * 20000,
        'float_col': [1.5, 2.5, 3.5] * 33334
    })

    print("\nBefore optimization:")
    print(df_types.dtypes)
    print(f"Memory: {df_types.memory_usage(deep=True).sum():,} bytes")

    # Downcast to smaller types
    df_types['int_col'] = pd.to_numeric(df_types['int_col'], downcast='integer')
    df_types['float_col'] = pd.to_numeric(df_types['float_col'], downcast='float')

    print("\nAfter optimization:")
    print(df_types.dtypes)
    print(f"Memory: {df_types.memory_usage(deep=True).sum():,} bytes")

    # 5. Avoid chained indexing
    print("\n5. Avoid chained indexing:")
    print("❌ Bad: df[df['A'] > 50]['B'] = 100  (chained)")
    print("✅ Good: df.loc[df['A'] > 50, 'B'] = 100")

    # 6. Use inplace when appropriate (though not always faster)
    print("\n6. Inplace operations:")
    df_temp = df.copy()
    start = time.time()
    df_temp = df_temp.drop(columns=['D'])
    drop_time = time.time() - start

    df_temp2 = df.copy()
    start = time.time()
    df_temp2.drop(columns=['D'], inplace=True)
    inplace_time = time.time() - start

    print(f"Regular drop: {drop_time:.4f}s")
    print(f"Inplace drop: {inplace_time:.4f}s")

    print("\nKEY TAKEAWAYS:")
    print("1. Use vectorized operations instead of loops")
    print("2. Convert repeated strings to categorical")
    print("3. Use query() for complex boolean indexing")
    print("4. Optimize data types (downcast integers/floats)")
    print("5. Avoid chained indexing - use loc/iloc")
    print("6. Consider chunking for very large files")
    print("7. Use eval() for complex expressions")
    print("8. Avoid apply() when vectorization is possible")