Building Panel DataFrames¶

This document covers how to construct panel data structures in pandas using MultiIndex.

Using MultiIndex.from_product¶

Creates a panel with all combinations of entities and time periods (balanced panel).

import pandas as pd
import numpy as np

# Define dimensions
tickers = ['AAPL', 'MSFT', 'GOOGL']
dates = pd.date_range('2024-01-01', periods=5)

# Create MultiIndex (Cartesian product)
index = pd.MultiIndex.from_product(
    [tickers, dates], 
    names=['ticker', 'date']
)

# Create panel DataFrame
np.random.seed(42)
n = len(index)

df = pd.DataFrame({
    'return': np.random.randn(n) * 0.02,
    'volume': np.random.randint(1000, 10000, n),
    'close': 100 + np.random.randn(n).cumsum()
}, index=index)

print(df.head(10))

                     return  volume       close
ticker date                                    
AAPL   2024-01-01  0.009934    5765  100.496714
       2024-01-02 -0.002765    6274   99.354071
       2024-01-03  0.012936    2627  100.001572
       2024-01-04  0.030486    8019  102.283551
       2024-01-05 -0.004675    3927  101.396356
MSFT   2024-01-01 -0.003129    3109  101.542560
       2024-01-02  0.015335    2239  100.533340
       2024-01-03  0.000296    9596  100.070105
       2024-01-04 -0.002013    8394   99.634291
       2024-01-05  0.002826    8259  100.421411

Using MultiIndex.from_tuples¶

Useful when you have specific entity-time combinations (unbalanced panel).

# Specific combinations only
tuples = [
    ('AAPL', '2024-01-01'),
    ('AAPL', '2024-01-02'),
    ('AAPL', '2024-01-03'),
    ('MSFT', '2024-01-02'),  # MSFT starts later
    ('MSFT', '2024-01-03'),
]

index = pd.MultiIndex.from_tuples(tuples, names=['ticker', 'date'])

df = pd.DataFrame({
    'return': [0.01, 0.02, -0.01, 0.015, 0.008]
}, index=index)

print(df)

Using MultiIndex.from_arrays¶

When you have separate arrays for each level:

tickers = ['AAPL', 'AAPL', 'AAPL', 'MSFT', 'MSFT', 'MSFT']
dates = ['2024-01-01', '2024-01-02', '2024-01-03'] * 2

index = pd.MultiIndex.from_arrays(
    [tickers, dates], 
    names=['ticker', 'date']
)

df = pd.DataFrame({'return': [0.01, 0.02, -0.01, 0.015, 0.008, 0.012]}, index=index)
print(df)

From Long-Format DataFrame¶

Convert a regular DataFrame to panel format:

# Long format data (common from databases)
data = pd.DataFrame({
    'ticker': ['AAPL', 'AAPL', 'MSFT', 'MSFT', 'GOOGL', 'GOOGL'],
    'date': pd.to_datetime(['2024-01-01', '2024-01-02'] * 3),
    'return': [0.01, 0.02, 0.015, 0.018, 0.008, 0.012],
    'volume': [1000, 1100, 2000, 2100, 1500, 1600]
})

print("Long format:")
print(data)
print()

# Convert to panel (MultiIndex)
panel = data.set_index(['ticker', 'date'])
print("Panel format:")
print(panel)

From Wide-Format DataFrame¶

Convert wide format (tickers as columns) to panel:

# Wide format
wide = pd.DataFrame({
    'AAPL': [0.01, 0.02, -0.01],
    'MSFT': [0.015, 0.018, 0.012],
    'GOOGL': [0.008, 0.012, 0.009]
}, index=pd.date_range('2024-01-01', periods=3))
wide.index.name = 'date'
wide.columns.name = 'ticker'

print("Wide format:")
print(wide)
print()

# Convert to panel using stack
panel = wide.stack()
panel.name = 'return'
print("Panel format (Series):")
print(panel)
print()

# Or swap levels to have ticker first
panel = panel.swaplevel().sort_index()
print("Panel format (ticker first):")
print(panel)

From Real Financial Data¶

import yfinance as yf

# Download multiple stocks
tickers = ['AAPL', 'MSFT', 'GOOGL']
start = '2024-01-01'
end = '2024-01-31'

# Download returns long-form data
dfs = []
for ticker in tickers:
    data = yf.Ticker(ticker).history(start=start, end=end)
    data['ticker'] = ticker
    data = data.reset_index()[['Date', 'ticker', 'Close', 'Volume']]
    dfs.append(data)

# Combine and set MultiIndex
panel = pd.concat(dfs, ignore_index=True)
panel = panel.rename(columns={'Date': 'date', 'Close': 'close', 'Volume': 'volume'})
panel = panel.set_index(['ticker', 'date']).sort_index()

print(panel.head(15))

Building with Specific Structure¶

Sector-Stock-Date Hierarchy¶

# Three-level panel
sectors = ['Tech', 'Finance']
tickers = {
    'Tech': ['AAPL', 'MSFT'],
    'Finance': ['JPM', 'BAC']
}
dates = pd.date_range('2024-01-01', periods=3)

# Build tuples
tuples = []
for sector, stocks in tickers.items():
    for stock in stocks:
        for date in dates:
            tuples.append((sector, stock, date))

index = pd.MultiIndex.from_tuples(tuples, names=['sector', 'ticker', 'date'])

df = pd.DataFrame({
    'return': np.random.randn(len(index)) * 0.02
}, index=index)

print(df)

Verifying Panel Structure¶

# Check panel dimensions
print(f"Index levels: {df.index.names}")
print(f"Number of levels: {df.index.nlevels}")
print(f"Shape: {df.shape}")

# Check balance
print(f"\nEntities: {df.index.get_level_values('ticker').unique()}")
print(f"Time periods: {df.index.get_level_values('date').unique()}")

# Check for balanced panel
def is_balanced(df, entity_level='ticker', time_level='date'):
    n_entities = df.index.get_level_values(entity_level).nunique()
    n_times = df.index.get_level_values(time_level).nunique()
    expected = n_entities * n_times
    actual = len(df)
    return actual == expected

print(f"\nBalanced panel: {is_balanced(df)}")

Best Practices¶

Name your index levels for clarity
Sort the index after creation for performance
Use appropriate dtypes (datetime for dates, category for tickers)
Verify balance if your analysis requires it

# Good practice example
index = pd.MultiIndex.from_product(
    [tickers, dates],
    names=['ticker', 'date']  # Named levels
)

df = pd.DataFrame({'return': data}, index=index)
df = df.sort_index()  # Sorted for performance
df.index = df.index.set_levels(
    df.index.levels[0].astype('category'),  # Categorical ticker
    level=0
)