Memory Efficiency with Categoricals¶
One of the primary benefits of categorical data is dramatic memory savings. This document demonstrates the memory characteristics and optimization strategies.
How Memory is Saved¶
String Storage (object dtype)¶
Each string value is stored separately in memory, even if repeated:
import pandas as pd
import numpy as np
# 1 million rows with 10 unique sectors
sectors = ['Technology', 'Finance', 'Healthcare', 'Retail', 'Energy',
'Utilities', 'Media', 'Aerospace', 'Banks', 'Insurance']
np.random.seed(42)
data = np.random.choice(sectors, size=1_000_000)
# String storage
s_string = pd.Series(data)
string_memory = s_string.memory_usage(deep=True)
print(f"String storage: {string_memory / 1e6:.2f} MB")
String storage: 57.89 MB
Categorical Storage¶
Categories are stored once; data stores only integer codes:
# Categorical storage
s_cat = s_string.astype('category')
cat_memory = s_cat.memory_usage(deep=True)
print(f"Categorical storage: {cat_memory / 1e6:.2f} MB")
print(f"Memory reduction: {string_memory / cat_memory:.1f}x")
Categorical storage: 1.00 MB
Memory reduction: 57.9x
Memory Breakdown¶
def analyze_categorical_memory(s_cat):
"""Analyze memory components of a categorical Series."""
# Category table size
categories = s_cat.cat.categories
cat_memory = categories.memory_usage(deep=True)
# Codes array size (integer array)
codes_memory = s_cat.cat.codes.nbytes
# Total
total = s_cat.memory_usage(deep=True)
print(f"Categories ({len(categories)} unique): {cat_memory:,} bytes")
print(f"Codes ({len(s_cat):,} values): {codes_memory:,} bytes")
print(f"Total: {total:,} bytes")
return cat_memory, codes_memory, total
s_cat = pd.Series(np.random.choice(['A', 'B', 'C'], 1_000_000), dtype='category')
analyze_categorical_memory(s_cat)
Categories (3 unique): 248 bytes
Codes (1,000,000 values): 1,000,000 bytes
Total: 1,000,376 bytes
Memory Comparison Table¶
def compare_memory(n_rows, n_categories, avg_string_length=10):
"""Compare string vs categorical memory usage."""
# Generate data
categories = [f'Cat_{i:0{len(str(n_categories))}d}' for i in range(n_categories)]
data = np.random.choice(categories, n_rows)
# String
s_string = pd.Series(data)
string_mem = s_string.memory_usage(deep=True)
# Categorical
s_cat = s_string.astype('category')
cat_mem = s_cat.memory_usage(deep=True)
return string_mem, cat_mem, string_mem / cat_mem
# Test different scenarios
scenarios = [
(100_000, 5),
(100_000, 50),
(100_000, 500),
(1_000_000, 10),
(1_000_000, 100),
(1_000_000, 1000),
]
print(f"{'Rows':>12} {'Categories':>12} {'String MB':>12} {'Cat MB':>12} {'Ratio':>8}")
print("-" * 60)
for n_rows, n_cats in scenarios:
str_mem, cat_mem, ratio = compare_memory(n_rows, n_cats)
print(f"{n_rows:>12,} {n_cats:>12} {str_mem/1e6:>12.2f} {cat_mem/1e6:>12.2f} {ratio:>8.1f}x")
Rows Categories String MB Cat MB Ratio
------------------------------------------------------------
100,000 5 5.79 0.10 57.9x
100,000 50 5.79 0.11 52.6x
100,000 500 6.30 0.15 42.0x
1,000,000 10 57.89 1.00 57.9x
1,000,000 100 57.89 1.01 57.3x
1,000,000 1000 62.89 1.10 57.2x
When Categoricals Save Memory¶
High Savings (Use Categorical)¶
- Few unique values relative to total rows
- Long string values
- Many repeated values
# Ideal case: 1M rows, 10 categories, long strings
countries = ['United States of America', 'United Kingdom', 'Germany',
'France', 'Japan', 'China', 'India', 'Brazil', 'Canada', 'Australia']
data = np.random.choice(countries, 1_000_000)
s_string = pd.Series(data)
s_cat = s_string.astype('category')
print(f"String: {s_string.memory_usage(deep=True) / 1e6:.1f} MB")
print(f"Categorical: {s_cat.memory_usage(deep=True) / 1e6:.1f} MB")
Low Savings (May Not Be Worth It)¶
- Many unique values (high cardinality)
- Short strings
- Few rows
# Poor case: many unique values
unique_ids = [f'ID_{i}' for i in range(100_000)] # All unique
s_string = pd.Series(unique_ids)
s_cat = s_string.astype('category')
print(f"String: {s_string.memory_usage(deep=True) / 1e6:.1f} MB")
print(f"Categorical: {s_cat.memory_usage(deep=True) / 1e6:.1f} MB")
# Similar or worse for high cardinality
Integer Code Sizes¶
Pandas automatically chooses the smallest integer type for codes:
| Number of Categories | Code Type | Bytes per Value |
|---|---|---|
| ≤ 127 | int8 | 1 |
| ≤ 32,767 | int16 | 2 |
| ≤ 2,147,483,647 | int32 | 4 |
| > 2,147,483,647 | int64 | 8 |
# Few categories -> int8
s = pd.Series(['a', 'b', 'c'] * 1000, dtype='category')
print(f"3 categories: {s.cat.codes.dtype}") # int8
# Many categories -> int16
cats = [f'cat_{i}' for i in range(200)]
s = pd.Series(np.random.choice(cats, 1000), dtype='category')
print(f"200 categories: {s.cat.codes.dtype}") # int16
DataFrame Memory Optimization¶
def optimize_dataframe(df, verbose=True):
"""Convert low-cardinality string columns to categorical."""
original_memory = df.memory_usage(deep=True).sum()
for col in df.select_dtypes(include=['object']).columns:
n_unique = df[col].nunique()
n_total = len(df)
# Convert if less than 50% unique values
if n_unique / n_total < 0.5:
df[col] = df[col].astype('category')
if verbose:
print(f"Converted '{col}': {n_unique} unique values")
new_memory = df.memory_usage(deep=True).sum()
if verbose:
print(f"\nMemory: {original_memory/1e6:.1f} MB → {new_memory/1e6:.1f} MB")
print(f"Reduction: {(1 - new_memory/original_memory)*100:.1f}%")
return df
# Example usage
df = pd.DataFrame({
'sector': np.random.choice(['Tech', 'Finance', 'Health'], 100_000),
'rating': np.random.choice(['A', 'B', 'C', 'D'], 100_000),
'id': [f'ID_{i}' for i in range(100_000)], # High cardinality - won't convert
'value': np.random.randn(100_000)
})
df = optimize_dataframe(df)
Converted 'sector': 3 unique values
Converted 'rating': 4 unique values
Memory: 14.2 MB → 2.1 MB
Reduction: 85.2%
Real-World Example: S&P 500 Data¶
# Simulate S&P 500 historical data
np.random.seed(42)
sectors = ['Technology', 'Healthcare', 'Finance', 'Energy',
'Consumer Discretionary', 'Consumer Staples',
'Industrials', 'Materials', 'Utilities',
'Real Estate', 'Communication Services']
tickers = [f'STOCK_{i:03d}' for i in range(500)]
dates = pd.date_range('2020-01-01', '2024-01-01', freq='B')
# Create large dataset
n_rows = len(tickers) * len(dates)
df = pd.DataFrame({
'date': np.tile(dates, len(tickers)),
'ticker': np.repeat(tickers, len(dates)),
'sector': np.repeat(np.random.choice(sectors, len(tickers)), len(dates)),
'close': np.random.randn(n_rows).cumsum() + 100,
'volume': np.random.randint(1000, 1000000, n_rows)
})
print(f"Dataset size: {len(df):,} rows")
print(f"\nBefore optimization:")
print(df.memory_usage(deep=True))
print(f"Total: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
# Optimize
df['ticker'] = df['ticker'].astype('category')
df['sector'] = df['sector'].astype('category')
print(f"\nAfter optimization:")
print(df.memory_usage(deep=True))
print(f"Total: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")
Guidelines¶
| Unique Values (% of rows) | Recommendation |
|---|---|
| < 1% | ✅ Definitely use categorical |
| 1-10% | ✅ Use categorical |
| 10-50% | ⚠️ Test both options |
| > 50% | ❌ Probably not beneficial |
Performance vs Memory Trade-off¶
Converting to categorical has a small upfront cost but saves memory and speeds up operations:
import time
# Large dataset
n = 5_000_000
sectors = ['A', 'B', 'C', 'D', 'E']
data = np.random.choice(sectors, n)
# Conversion time
start = time.time()
s_cat = pd.Series(data, dtype='category')
conv_time = time.time() - start
print(f"Conversion time: {conv_time:.2f}s")
# GroupBy comparison
s_string = pd.Series(data)
values = np.random.randn(n)
start = time.time()
pd.Series(values).groupby(s_string).mean()
string_groupby = time.time() - start
start = time.time()
pd.Series(values).groupby(s_cat).mean()
cat_groupby = time.time() - start
print(f"String groupby: {string_groupby:.3f}s")
print(f"Categorical groupby: {cat_groupby:.3f}s")
print(f"Speedup: {string_groupby/cat_groupby:.1f}x")