Memory Optimization¶

Efficient memory usage is critical when working with large datasets. This document covers strategies to reduce memory consumption in pandas.

Understanding Memory Consumption¶

Check Current Memory¶

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'int_col': np.random.randint(0, 100, 1_000_000),
    'float_col': np.random.randn(1_000_000),
    'str_col': np.random.choice(['A', 'B', 'C', 'D', 'E'], 1_000_000)
})

# Total memory (accurate)
total_mb = df.memory_usage(deep=True).sum() / 1e6
print(f"Total: {total_mb:.1f} MB")

# Per-column breakdown
print(df.memory_usage(deep=True))

Strategy 1: Downcast Numeric Types¶

Pandas defaults to int64 and float64, but smaller types often suffice.

Integer Downcasting¶

def downcast_integers(df):
    """Downcast integer columns to smallest type."""
    for col in df.select_dtypes(include=['int64']).columns:
        c_min, c_max = df[col].min(), df[col].max()

        if c_min >= 0:  # Unsigned
            if c_max <= 255:
                df[col] = df[col].astype('uint8')
            elif c_max <= 65535:
                df[col] = df[col].astype('uint16')
            elif c_max <= 4294967295:
                df[col] = df[col].astype('uint32')
        else:  # Signed
            if c_min >= -128 and c_max <= 127:
                df[col] = df[col].astype('int8')
            elif c_min >= -32768 and c_max <= 32767:
                df[col] = df[col].astype('int16')
            elif c_min >= -2147483648 and c_max <= 2147483647:
                df[col] = df[col].astype('int32')

    return df

Float Downcasting¶

def downcast_floats(df):
    """Downcast float columns to float32."""
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    return df

Using pd.to_numeric with downcast¶

# Automatic downcasting
df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
df['float_col'] = pd.to_numeric(df['float_col'], downcast='float')

Strategy 2: Use Categorical for Strings¶

String columns with repeated values benefit enormously from categorical encoding.

When to Use Categorical¶

Condition	Recommendation
Unique values < 50% of rows	✅ Use categorical
Few unique values, many rows	✅ Use categorical
All unique values	❌ No benefit
Need string methods	⚠️ Convert back first

Conversion Example¶

# Before: string storage
print(f"String: {df['str_col'].memory_usage(deep=True) / 1e6:.2f} MB")

# After: categorical storage
df['str_col'] = df['str_col'].astype('category')
print(f"Categorical: {df['str_col'].memory_usage(deep=True) / 1e6:.2f} MB")

Typical result:

String: 57.00 MB
Categorical: 1.00 MB

Automatic Detection¶

def convert_to_categorical(df, threshold=0.5):
    """Convert low-cardinality object columns to categorical."""
    for col in df.select_dtypes(include=['object']).columns:
        ratio = df[col].nunique() / len(df)
        if ratio < threshold:
            df[col] = df[col].astype('category')
            print(f"Converted {col}: {df[col].nunique()} unique values")
    return df

Strategy 3: Specify dtypes at Load Time¶

Loading data with optimized types is more efficient than converting after.

read_csv with dtype¶

# Define optimal types upfront
dtypes = {
    'id': 'int32',
    'value': 'float32',
    'category': 'category',
    'flag': 'bool'
}

df = pd.read_csv('data.csv', dtype=dtypes)

CategoricalDtype for Ordered Categories¶

rating_dtype = pd.CategoricalDtype(
    categories=['D', 'C', 'B', 'A', 'S'],
    ordered=True
)

df = pd.read_csv('data.csv', dtype={'rating': rating_dtype})

Strategy 4: Sparse Data¶

For data with many repeated values (often zeros), use sparse types.

# Dense array: stores every value
dense = pd.Series([0, 0, 0, 1, 0, 0, 0, 0, 2, 0] * 100000)
print(f"Dense: {dense.memory_usage() / 1e6:.2f} MB")

# Sparse array: stores only non-default values
sparse = dense.astype('Sparse[int64]')
print(f"Sparse: {sparse.memory_usage() / 1e6:.2f} MB")

Complete Optimization Function¶

def optimize_dataframe(df, verbose=True):
    """Optimize DataFrame memory usage."""
    initial_mem = df.memory_usage(deep=True).sum()

    # Downcast integers
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    # Downcast floats
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    # Convert low-cardinality strings to categorical
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() / len(df) < 0.5:
            df[col] = df[col].astype('category')

    final_mem = df.memory_usage(deep=True).sum()

    if verbose:
        print(f"Memory: {initial_mem/1e6:.1f} MB → {final_mem/1e6:.1f} MB")
        print(f"Reduction: {(1 - final_mem/initial_mem)*100:.1f}%")

    return df

Practical Example: Stock Data¶

# Simulated stock data
np.random.seed(42)
n = 2_000_000

df = pd.DataFrame({
    'date': pd.date_range('2000-01-01', periods=n, freq='T'),
    'ticker': np.random.choice(['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META'], n),
    'open': np.random.uniform(100, 200, n),
    'high': np.random.uniform(100, 200, n),
    'low': np.random.uniform(100, 200, n),
    'close': np.random.uniform(100, 200, n),
    'volume': np.random.randint(1000, 1000000, n)
})

print("Before optimization:")
print(df.info(memory_usage='deep'))

# Optimize
df = optimize_dataframe(df)

print("\nAfter optimization:")
print(df.info(memory_usage='deep'))
print(df.dtypes)

Memory Monitoring During Processing¶

def memory_checkpoint(df, label):
    """Print memory usage at checkpoint."""
    mem = df.memory_usage(deep=True).sum() / 1e6
    print(f"[{label}] Memory: {mem:.1f} MB")

# Usage
memory_checkpoint(df, "Start")

df['new_col'] = df['close'] * df['volume']
memory_checkpoint(df, "After calculation")

df = df[df['volume'] > 10000]
memory_checkpoint(df, "After filtering")

Summary Table¶

Strategy	Typical Savings	Use Case
Downcast integers	50-87%	All numeric data
Downcast floats	50%	Float columns
Categorical	90%+	Repeated strings
Sparse	Varies	Many repeated values
Specify dtype at load	Avoids conversion	All data loading

Best Practices¶

Profile first: Use memory_usage(deep=True) to find big columns
Optimize at load time: Specify dtypes in read_csv()
Use categorical: For columns with < 50% unique values
Downcast numerics: Use smallest type that fits your data
Monitor growth: Check memory after transformations
Drop unused columns: Remove columns you don't need
Process in chunks: For files too large for RAM