Skip to content

GroupBy Aggregations

GroupBy objects support various aggregation methods to summarize grouped data.

Basic Aggregations

Apply single aggregation functions.

1. Mean per Group

import pandas as pd

df = pd.DataFrame({
    'asset': ['A', 'A', 'B', 'B'],
    'return': [0.01, -0.02, 0.03, 0.01],
})

df.groupby('asset')['return'].mean()
asset
A    -0.005
B     0.020
Name: return, dtype: float64

2. Common Aggregations

df.groupby('asset')['return'].sum()
df.groupby('asset')['return'].count()
df.groupby('asset')['return'].std()
df.groupby('asset')['return'].min()
df.groupby('asset')['return'].max()

3. Multiple Methods

df.groupby('asset')['return'].agg(['mean', 'std', 'count'])

LeetCode Example: Duplicate Emails

Count email occurrences.

1. Sample Data

person = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'email': ['a@ex.com', 'b@ex.com', 'a@ex.com', 'b@ex.com', 'c@ex.com']
})

2. GroupBy Count

email_counts = person.groupby('email')['id'].count().reset_index(name='count')
print(email_counts)
      email  count
0  a@ex.com      2
1  b@ex.com      2
2  c@ex.com      1

3. Find Duplicates

duplicates = email_counts[email_counts['count'] > 1]['email']

LeetCode Example: Customer Orders

Find customer with most orders.

1. Sample Data

orders = pd.DataFrame({
    'order_number': [101, 102, 103, 104, 105],
    'customer_number': [1, 1, 2, 3, 2]
})

2. Count per Customer

order_counts = orders.groupby('customer_number')['order_number'].count().reset_index()
print(order_counts)

3. Find Maximum

max_orders = order_counts.loc[order_counts['order_number'].idxmax()]

Named Aggregations

Create descriptive column names.

1. Named Syntax

df.groupby('asset').agg(
    mean_return=('return', 'mean'),
    std_return=('return', 'std'),
    count=('return', 'count')
)

2. Dictionary Syntax

df.groupby('asset').agg({
    'return': ['mean', 'std', 'count']
})

3. Custom Functions

df.groupby('asset').agg(
    range=('return', lambda x: x.max() - x.min())
)

reset_index

Convert index to columns.

1. Default Result

result = df.groupby('asset')['return'].mean()
# asset is index

2. With reset_index

result = df.groupby('asset')['return'].mean().reset_index()
# asset is column

3. as_index=False

result = df.groupby('asset', as_index=False)['return'].mean()
# Equivalent to reset_index()

Runnable Example: business_analytics_example.py

"""
Business Analytics: GroupBy, Pivot Tables, and Visualization

A practical business analytics workflow using Pandas groupby,
pivot_table, and crosstab for multi-dimensional analysis.

Topics covered:
- GroupBy with multiple aggregation functions
- Pivot tables for cross-tabulation
- Month-over-month (MoM) calculations
- Percent-of-total calculations
- Summary statistics by category

Based on Python-100-Days Day66-80 day05.ipynb business analytics examples.
"""

import numpy as np
import pandas as pd


# =============================================================================
# Setup: Generate Sample Business Data
# =============================================================================

def create_sales_data() -> pd.DataFrame:
    """Create sample sales data for analytics."""
    np.random.seed(42)
    n = 200

    data = {
        'date': pd.date_range('2023-01-01', periods=n, freq='B'),
        'region': np.random.choice(['East', 'West', 'North', 'South'], n),
        'channel': np.random.choice(['Online', 'Retail', 'Wholesale'], n,
                                     p=[0.4, 0.35, 0.25]),
        'product': np.random.choice(['Widget A', 'Widget B', 'Widget C'], n),
        'units': np.random.randint(10, 200, n),
        'unit_price': np.random.choice([25.0, 45.0, 80.0], n),
    }
    df = pd.DataFrame(data)
    df['revenue'] = df['units'] * df['unit_price']
    df['cost'] = df['revenue'] * np.random.uniform(0.4, 0.7, n)
    df['profit'] = df['revenue'] - df['cost']
    df['month'] = df['date'].dt.to_period('M')
    return df


# =============================================================================
# Analysis 1: Monthly Revenue Summary
# =============================================================================

def monthly_summary(df: pd.DataFrame) -> None:
    """Summarize revenue by month with MoM growth."""
    print("=== Monthly Revenue Summary ===")

    monthly = df.groupby('month').agg(
        total_revenue=('revenue', 'sum'),
        total_profit=('profit', 'sum'),
        num_orders=('revenue', 'count'),
        avg_order_value=('revenue', 'mean'),
    ).round(0)

    # Month-over-month growth
    monthly['revenue_mom'] = monthly['total_revenue'].pct_change()
    monthly['profit_margin'] = monthly['total_profit'] / monthly['total_revenue']

    print(monthly.to_string())
    print()


# =============================================================================
# Analysis 2: Region x Channel Pivot Table
# =============================================================================

def region_channel_analysis(df: pd.DataFrame) -> None:
    """Pivot table: revenue by region and channel."""
    print("=== Revenue by Region x Channel ===")

    pivot = pd.pivot_table(
        df,
        values='revenue',
        index='region',
        columns='channel',
        aggfunc='sum',
        margins=True,           # Add row/column totals
        margins_name='Total',
    ).round(0)

    print(pivot.to_string())
    print()

    # Percent of total
    print("--- Percent of Total ---")
    total = pivot.loc['Total', 'Total']
    pct = (pivot / total * 100).round(1)
    print(pct.to_string())
    print()


# =============================================================================
# Analysis 3: Product Performance
# =============================================================================

def product_performance(df: pd.DataFrame) -> None:
    """Analyze performance by product."""
    print("=== Product Performance ===")

    product_stats = df.groupby('product').agg(
        total_units=('units', 'sum'),
        total_revenue=('revenue', 'sum'),
        total_profit=('profit', 'sum'),
        avg_price=('unit_price', 'mean'),
        num_orders=('revenue', 'count'),
    )
    product_stats['profit_margin'] = (
        product_stats['total_profit'] / product_stats['total_revenue']
    )
    product_stats['revenue_per_order'] = (
        product_stats['total_revenue'] / product_stats['num_orders']
    )

    # Sort by total revenue
    product_stats = product_stats.sort_values('total_revenue', ascending=False)

    print(product_stats.round(1).to_string())
    print()


# =============================================================================
# Analysis 4: Cross-Tabulation
# =============================================================================

def channel_region_crosstab(df: pd.DataFrame) -> None:
    """Cross-tabulation of order counts by channel and region."""
    print("=== Order Count: Channel x Region (crosstab) ===")

    ct = pd.crosstab(
        df['channel'],
        df['region'],
        margins=True,
        margins_name='Total',
    )
    print(ct.to_string())
    print()

    # Normalize by row (channel distribution across regions)
    print("--- Channel Distribution Across Regions (%) ---")
    ct_pct = pd.crosstab(
        df['channel'],
        df['region'],
        normalize='index',
    ).round(3) * 100
    print(ct_pct.to_string())
    print()


# =============================================================================
# Analysis 5: Multi-Level Grouping
# =============================================================================

def multi_level_grouping(df: pd.DataFrame) -> None:
    """Group by multiple columns with multiple aggregations."""
    print("=== Multi-Level Grouping: Region + Product ===")

    result = df.groupby(['region', 'product']).agg({
        'revenue': ['sum', 'mean', 'count'],
        'profit': ['sum'],
        'units': ['sum'],
    }).round(0)

    # Flatten column MultiIndex
    result.columns = ['_'.join(col).strip() for col in result.columns]

    # Top 5 by revenue
    print("Top 5 by Total Revenue:")
    print(result.nlargest(5, 'revenue_sum').to_string())
    print()


# =============================================================================
# Analysis 6: Transform and Rank within Groups
# =============================================================================

def group_transform_rank(df: pd.DataFrame) -> None:
    """Use transform and rank within groups."""
    print("=== Transform: Percent of Regional Revenue ===")

    # Each order's revenue as % of its region's total
    df_copy = df[['region', 'channel', 'revenue']].copy()
    df_copy['region_total'] = df_copy.groupby('region')['revenue'].transform('sum')
    df_copy['pct_of_region'] = (df_copy['revenue'] / df_copy['region_total'] * 100).round(2)

    # Rank within region
    df_copy['rank_in_region'] = df_copy.groupby('region')['revenue'].rank(
        ascending=False, method='dense'
    ).astype(int)

    print(df_copy.head(10).to_string(index=False))
    print()


# =============================================================================
# Main
# =============================================================================

if __name__ == '__main__':
    df = create_sales_data()
    print(f"Dataset: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
    print(f"Columns: {df.columns.tolist()}")
    print()

    monthly_summary(df)
    region_channel_analysis(df)
    product_performance(df)
    channel_region_crosstab(df)
    multi_level_grouping(df)
    group_transform_rank(df)