Skip to content

Kernel Density Estimation with imshow

Kernel Density Estimation (KDE) provides a smooth estimate of the probability density function from sample data. This document covers how to visualize 2D KDE using imshow.

What is KDE?

KDE estimates the probability density function by placing a kernel (typically Gaussian) at each data point and summing them:

\[\hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)\]

where: - \(K\) is the kernel function - \(h\) is the bandwidth (smoothing parameter) - \(n\) is the number of data points

Setup

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

np.random.seed(42)

Basic 2D KDE

1. Generate Sample Data

# Generate bivariate data
n = 500
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]
data = np.random.multivariate_normal(mean, cov, n)

x = data[:, 0]
y = data[:, 1]

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5, s=10)
ax.set_title('Raw Data Points')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()

2. Compute KDE

# Create grid
xmin, xmax = x.min() - 1, x.max() + 1
ymin, ymax = y.min() - 1, y.max() + 1
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])

# Compute KDE
kernel = stats.gaussian_kde(np.vstack([x, y]))
density = kernel(positions).reshape(xx.shape)

3. Visualize with imshow

fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='viridis',
    aspect='auto'
)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('2D Kernel Density Estimation')
plt.colorbar(im, ax=ax, label='Density')
plt.show()

KDE with Data Points Overlay

1. Basic Overlay

fig, ax = plt.subplots(figsize=(8, 6))

# KDE background
im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='Blues',
    aspect='auto'
)

# Scatter points
ax.scatter(x, y, c='red', s=5, alpha=0.3)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('KDE with Data Points')
plt.colorbar(im, ax=ax, label='Density')
plt.show()

2. Transparent KDE

fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(x, y, c='black', s=10, alpha=0.5, label='Data')
im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='Reds',
    aspect='auto',
    alpha=0.6
)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Transparent KDE Overlay')
plt.colorbar(im, ax=ax, label='Density')
plt.show()

Bandwidth Selection

The bandwidth parameter controls smoothing. Larger values = smoother estimates.

1. Bandwidth Comparison

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

bandwidths = [0.1, 0.3, 0.8]

for ax, bw in zip(axes, bandwidths):
    kernel = stats.gaussian_kde(np.vstack([x, y]), bw_method=bw)
    density = kernel(positions).reshape(xx.shape)

    im = ax.imshow(
        density.T,
        extent=[xmin, xmax, ymin, ymax],
        origin='lower',
        cmap='viridis',
        aspect='auto'
    )
    ax.set_title(f'Bandwidth = {bw}')
    plt.colorbar(im, ax=ax)

plt.suptitle('Effect of Bandwidth on KDE', fontsize=14)
plt.tight_layout()
plt.show()

2. Scott's Rule vs Silverman's Rule

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

methods = ['scott', 'silverman', 0.3]
titles = ["Scott's Rule", "Silverman's Rule", "Manual (0.3)"]

for ax, method, title in zip(axes, methods, titles):
    kernel = stats.gaussian_kde(np.vstack([x, y]), bw_method=method)
    density = kernel(positions).reshape(xx.shape)

    im = ax.imshow(
        density.T,
        extent=[xmin, xmax, ymin, ymax],
        origin='lower',
        cmap='viridis',
        aspect='auto'
    )
    ax.set_title(title)
    plt.colorbar(im, ax=ax)

plt.tight_layout()
plt.show()

Different Distributions

1. Clustered Data

# Generate clustered data
np.random.seed(42)
n1 = 300
n2 = 200

cluster1 = np.random.multivariate_normal([2, 2], [[0.5, 0], [0, 0.5]], n1)
cluster2 = np.random.multivariate_normal([-1, -1], [[0.3, 0.2], [0.2, 0.3]], n2)
data_clustered = np.vstack([cluster1, cluster2])

x_c = data_clustered[:, 0]
y_c = data_clustered[:, 1]

# Compute KDE
xmin, xmax = x_c.min() - 1, x_c.max() + 1
ymin, ymax = y_c.min() - 1, y_c.max() + 1
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])

kernel = stats.gaussian_kde(np.vstack([x_c, y_c]))
density = kernel(positions).reshape(xx.shape)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Scatter
axes[0].scatter(x_c, y_c, alpha=0.5, s=10)
axes[0].set_title('Scatter Plot')

# KDE
im = axes[1].imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='hot',
    aspect='auto'
)
axes[1].set_title('KDE')
plt.colorbar(im, ax=axes[1])

plt.suptitle('Clustered Data', fontsize=14)
plt.tight_layout()
plt.show()

2. Ring Distribution

# Generate ring-shaped data
n = 1000
theta = np.random.uniform(0, 2 * np.pi, n)
r = np.random.normal(2, 0.2, n)
x_ring = r * np.cos(theta)
y_ring = r * np.sin(theta)

# Compute KDE
xmin, xmax = -4, 4
ymin, ymax = -4, 4
xx, yy = np.mgrid[xmin:xmax:150j, ymin:ymax:150j]
positions = np.vstack([xx.ravel(), yy.ravel()])

kernel = stats.gaussian_kde(np.vstack([x_ring, y_ring]), bw_method=0.15)
density = kernel(positions).reshape(xx.shape)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].scatter(x_ring, y_ring, alpha=0.3, s=5)
axes[0].set_aspect('equal')
axes[0].set_title('Scatter Plot')

im = axes[1].imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='magma',
    aspect='equal'
)
axes[1].set_title('KDE')
plt.colorbar(im, ax=axes[1])

plt.suptitle('Ring Distribution', fontsize=14)
plt.tight_layout()
plt.show()

Comparison Methods

KDE vs hist2d vs hexbin

np.random.seed(42)
n = 2000
x = np.random.normal(0, 1, n)
y = x + np.random.normal(0, 0.5, n)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# hist2d
h = axes[0].hist2d(x, y, bins=30, cmap='viridis')
axes[0].set_title('hist2d')
plt.colorbar(h[3], ax=axes[0])

# hexbin
hb = axes[1].hexbin(x, y, gridsize=20, cmap='viridis')
axes[1].set_title('hexbin')
plt.colorbar(hb, ax=axes[1])

# KDE
xmin, xmax = x.min() - 1, x.max() + 1
ymin, ymax = y.min() - 1, y.max() + 1
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
kernel = stats.gaussian_kde(np.vstack([x, y]))
density = kernel(positions).reshape(xx.shape)

im = axes[2].imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='viridis',
    aspect='auto'
)
axes[2].set_title('KDE (imshow)')
plt.colorbar(im, ax=axes[2])

plt.suptitle('Density Visualization Methods', fontsize=14)
plt.tight_layout()
plt.show()

Method Comparison Table

Method Pros Cons
hist2d Fast, simple Bin edge artifacts
hexbin Better packing, no axis alignment Less familiar
KDE Smooth, accurate Computationally expensive

Combining KDE with Contours

1. KDE with Contour Lines

fig, ax = plt.subplots(figsize=(8, 6))

im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='Blues',
    aspect='auto'
)

# Add contour lines
cs = ax.contour(
    xx, yy, density,
    levels=5,
    colors='darkblue',
    linewidths=0.8
)
ax.clabel(cs, inline=True, fontsize=8, fmt='%.3f')

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('KDE with Contour Lines')
plt.colorbar(im, ax=ax, label='Density')
plt.show()

2. KDE with Percentile Contours

# Calculate percentile levels
levels_pct = np.percentile(density.ravel(), [10, 25, 50, 75, 90])

fig, ax = plt.subplots(figsize=(8, 6))

im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='YlOrRd',
    aspect='auto'
)

cs = ax.contour(xx, yy, density, levels=levels_pct, colors='black', linewidths=0.8)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('KDE with Percentile Contours')
plt.colorbar(im, ax=ax, label='Density')
plt.show()

Marginal Distributions

KDE with Marginal Histograms

from mpl_toolkits.axes_grid1 import make_axes_locatable

np.random.seed(42)
n = 500
data = np.random.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], n)
x, y = data[:, 0], data[:, 1]

# Compute KDE
xmin, xmax = x.min() - 0.5, x.max() + 0.5
ymin, ymax = y.min() - 0.5, y.max() + 0.5
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([xx.ravel(), yy.ravel()])
kernel = stats.gaussian_kde(np.vstack([x, y]))
density = kernel(positions).reshape(xx.shape)

# Create figure with marginals
fig, ax_main = plt.subplots(figsize=(8, 8))

divider = make_axes_locatable(ax_main)
ax_top = divider.append_axes("top", 1.2, pad=0.1, sharex=ax_main)
ax_right = divider.append_axes("right", 1.2, pad=0.1, sharey=ax_main)

# Main KDE plot
im = ax_main.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='Blues',
    aspect='auto'
)
ax_main.set_xlabel('x')
ax_main.set_ylabel('y')

# Top marginal
ax_top.hist(x, bins=30, density=True, alpha=0.7, color='steelblue')
x_kde = np.linspace(xmin, xmax, 200)
ax_top.plot(x_kde, stats.gaussian_kde(x)(x_kde), 'darkblue', linewidth=2)
ax_top.set_ylabel('Density')
plt.setp(ax_top.get_xticklabels(), visible=False)

# Right marginal
ax_right.hist(y, bins=30, density=True, alpha=0.7, color='steelblue', orientation='horizontal')
y_kde = np.linspace(ymin, ymax, 200)
ax_right.plot(stats.gaussian_kde(y)(y_kde), y_kde, 'darkblue', linewidth=2)
ax_right.set_xlabel('Density')
plt.setp(ax_right.get_yticklabels(), visible=False)

plt.suptitle('2D KDE with Marginal Distributions', fontsize=14, y=1.02)
plt.show()

Colorbar Customization

1. Discrete Colorbar

fig, ax = plt.subplots(figsize=(8, 6))

levels = np.linspace(density.min(), density.max(), 10)
im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='viridis',
    aspect='auto'
)

cbar = plt.colorbar(im, ax=ax, ticks=levels[::2])
cbar.set_label('Density')

ax.set_title('KDE with Custom Colorbar')
plt.show()

2. Logarithmic Scale

from matplotlib.colors import LogNorm

# Ensure positive values
density_pos = np.maximum(density, 1e-10)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Linear scale
im1 = axes[0].imshow(
    density_pos.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='viridis',
    aspect='auto'
)
axes[0].set_title('Linear Scale')
plt.colorbar(im1, ax=axes[0])

# Log scale
im2 = axes[1].imshow(
    density_pos.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='viridis',
    aspect='auto',
    norm=LogNorm()
)
axes[1].set_title('Logarithmic Scale')
plt.colorbar(im2, ax=axes[1])

plt.tight_layout()
plt.show()

Full Example: Publication Quality

np.random.seed(42)

# Generate data with correlation
n = 1000
rho = 0.7
mean = [0, 0]
cov = [[1, rho], [rho, 1]]
data = np.random.multivariate_normal(mean, cov, n)
x, y = data[:, 0], data[:, 1]

# Compute KDE
xmin, xmax = -4, 4
ymin, ymax = -4, 4
xx, yy = np.mgrid[xmin:xmax:150j, ymin:ymax:150j]
positions = np.vstack([xx.ravel(), yy.ravel()])
kernel = stats.gaussian_kde(np.vstack([x, y]))
density = kernel(positions).reshape(xx.shape)

# Create figure
fig, ax = plt.subplots(figsize=(9, 7))

# KDE heatmap
im = ax.imshow(
    density.T,
    extent=[xmin, xmax, ymin, ymax],
    origin='lower',
    cmap='Blues',
    aspect='equal'
)

# Contour lines
levels = np.percentile(density.ravel(), [20, 40, 60, 80, 95])
cs = ax.contour(xx, yy, density, levels=levels, colors='navy', linewidths=0.8, alpha=0.7)

# Scatter subset
idx = np.random.choice(n, 100, replace=False)
ax.scatter(x[idx], y[idx], c='darkred', s=10, alpha=0.5, zorder=5)

# Labels and title
ax.set_xlabel('$X$', fontsize=12)
ax.set_ylabel('$Y$', fontsize=12)
ax.set_title(f'Bivariate Normal Distribution ($\\rho = {rho}$)', fontsize=14, fontweight='bold')

# Colorbar
cbar = plt.colorbar(im, ax=ax, shrink=0.85)
cbar.set_label('Estimated Density', fontsize=11)

ax.tick_params(labelsize=10)
plt.tight_layout()
plt.show()

Summary

Step Code
Create grid xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
Stack positions positions = np.vstack([xx.ravel(), yy.ravel()])
Compute KDE kernel = stats.gaussian_kde(np.vstack([x, y]))
Evaluate density = kernel(positions).reshape(xx.shape)
Display ax.imshow(density.T, extent=[...], origin='lower')