Skip to content

fillna Method

The fillna() method replaces missing values with specified values. It is one of the most common approaches to handling missing data.

Single Value Fill

Replace all NaN values with a single value.

1. Constant Value

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'temperature': [21, np.nan, 25, np.nan],
    'humidity': [65, 68, np.nan, 75]
})

dg = df.fillna(0)
print(dg)
   temperature  humidity
0         21.0      65.0
1          0.0      68.0
2         25.0       0.0
3          0.0      75.0

2. Mean Fill

df['temperature'].fillna(df['temperature'].mean())

3. Median Fill

df['temperature'].fillna(df['temperature'].median())

Column-specific Fill

Use a dictionary to specify different fill values per column.

1. Dictionary Mapping

url = "https://raw.githubusercontent.com/codebasics/py/master/pandas/5_handling_missing_data_fillna_dropna_interpolate/weather_data.csv"
df = pd.read_csv(url, index_col='day', parse_dates=True)

dg = df.fillna({
    "temperature": 30,
    "windspeed": df.windspeed.mean(),
    "event": "No Event",
})
print(dg)

2. Computed Values

fill_values = {
    'temperature': df['temperature'].mean(),
    'humidity': df['humidity'].median()
}
df.fillna(fill_values)

3. Conditional Fill

df['temperature'] = df['temperature'].fillna(
    df.groupby('region')['temperature'].transform('mean')
)

inplace Parameter

Modify the DataFrame directly without creating a copy.

1. Without inplace

dg = df.fillna(0)  # Returns new DataFrame
# df is unchanged

2. With inplace

df.fillna(0, inplace=True)  # Modifies df directly

3. Modern Practice

Prefer reassignment over inplace=True:

df = df.fillna(0)  # More explicit

LeetCode Example

Fill referee_id with 0 for customers without referrer.

1. Problem Context

customer = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
    'referee_id': [1.0, 2.0, np.nan, 3.0, np.nan]
})

2. Fill NaN Values

customer["referee_id"].fillna(0)

3. Result

0    1.0
1    2.0
2    0.0
3    3.0
4    0.0
Name: referee_id, dtype: float64

Dictionary Fill Example

Fill missing prices with 0.0 in sales data.

1. Sample Data

sold_with_prices = pd.DataFrame({
    'product_id': [1, 1, 2],
    'purchase_date': ['2024-01-15', '2024-05-10', '2024-07-01'],
    'units': [10, 5, 8],
    'price': [100, None, 180]
})

2. Fill with Dictionary

sold_with_prices.fillna({'price': 0.0}, inplace=True)

3. Resulting DataFrame

   product_id purchase_date  units  price
0           1    2024-01-15     10  100.0
1           1    2024-05-10      5    0.0
2           2    2024-07-01      8  180.0

Runnable Example: data_preprocessing_workflow.py

"""
Data Preprocessing Workflow: Cleaning Real-World Data

A practical workflow demonstrating common data cleaning operations
that are needed before analysis or machine learning.

Steps covered:
1. Handling missing values (detect, fill, drop)
2. Removing duplicates
3. String column splitting and extraction
4. Value replacement and mapping
5. Normalization (min-max scaling, z-score standardization)
6. Binning continuous variables

Based on Python-100-Days Day66-80 day04.ipynb data cleaning examples.
"""

import numpy as np
import pandas as pd


# =============================================================================
# Step 1: Create Sample Messy Data
# =============================================================================

def create_sample_data() -> pd.DataFrame:
    """Create a messy DataFrame that needs preprocessing."""
    data = {
        'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Diana',
                 'Eve', 'Frank', None, 'Grace', 'Bob'],
        'age': [28, 35, None, 28, 42, 31, None, 29, 38, 35],
        'salary_range': ['50K-70K', '80K-100K', '60K-80K', '50K-70K',
                         '90K-120K', '70K-90K', '55K-75K', '65K-85K',
                         '100K-130K', '80K-100K'],
        'department': ['Engineering', 'Marketing', 'Engineering', 'Engineering',
                       'Management', 'marketing', 'engineering', 'Sales',
                       'Management', 'Marketing'],
        'score': [85, 92, 78, 85, 95, 88, 73, None, 91, 92],
        'join_date': ['2020-03-15', '2019-07-22', '2021-01-10', '2020-03-15',
                      '2018-11-05', '2020-08-17', '2022-02-28', '2021-06-12',
                      '2019-03-08', '2019-07-22'],
    }
    return pd.DataFrame(data)


# =============================================================================
# Step 2: Inspect and Report Issues
# =============================================================================

def inspect_data(df: pd.DataFrame) -> None:
    """Report data quality issues."""
    print("=== Data Inspection ===")
    print(f"Shape: {df.shape}")
    print(f"\nMissing values:\n{df.isnull().sum()}")
    print(f"\nDuplicate rows: {df.duplicated().sum()}")
    print(f"\nData types:\n{df.dtypes}")
    print()


# =============================================================================
# Step 3: Clean the Data
# =============================================================================

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Apply a sequence of cleaning operations."""
    df = df.copy()

    # --- Remove duplicates ---
    print("--- Removing Duplicates ---")
    before = len(df)
    df = df.drop_duplicates()
    print(f"  Removed {before - len(df)} duplicate rows")

    # --- Handle missing values ---
    print("\n--- Handling Missing Values ---")

    # Drop rows where name is missing (can't identify)
    df = df.dropna(subset=['name'])
    print(f"  Dropped rows with missing name")

    # Fill numeric missing values with median
    for col in ['age', 'score']:
        median_val = df[col].median()
        filled = df[col].isnull().sum()
        df[col] = df[col].fillna(median_val)
        print(f"  Filled {filled} missing {col} with median ({median_val})")

    # --- Standardize text columns ---
    print("\n--- Standardizing Text ---")
    df['department'] = df['department'].str.strip().str.title()
    print(f"  Departments: {df['department'].unique().tolist()}")

    # --- Parse dates ---
    print("\n--- Parsing Dates ---")
    df['join_date'] = pd.to_datetime(df['join_date'])
    print(f"  Converted join_date to datetime")

    # --- Extract salary range into min/max columns ---
    print("\n--- Extracting Salary Range ---")
    salary_split = df['salary_range'].str.replace('K', '').str.split('-', expand=True)
    df['salary_min'] = salary_split[0].astype(float) * 1000
    df['salary_max'] = salary_split[1].astype(float) * 1000
    df['salary_mid'] = (df['salary_min'] + df['salary_max']) / 2
    print(f"  Created salary_min, salary_max, salary_mid columns")

    return df


# =============================================================================
# Step 4: Normalize Numeric Columns
# =============================================================================

def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Apply normalization techniques."""
    df = df.copy()

    print("\n--- Normalization ---")

    # Min-Max Scaling: scales to [0, 1]
    # formula: (x - min) / (max - min)
    col = 'score'
    min_val, max_val = df[col].min(), df[col].max()
    df['score_minmax'] = (df[col] - min_val) / (max_val - min_val)
    print(f"  Min-Max scaled '{col}': [{df['score_minmax'].min():.2f}, "
          f"{df['score_minmax'].max():.2f}]")

    # Z-Score Standardization: mean=0, std=1
    # formula: (x - mean) / std
    df['score_zscore'] = (df[col] - df[col].mean()) / df[col].std()
    print(f"  Z-Score '{col}': mean={df['score_zscore'].mean():.4f}, "
          f"std={df['score_zscore'].std():.4f}")

    return df


# =============================================================================
# Step 5: Bin Continuous Variables
# =============================================================================

def bin_data(df: pd.DataFrame) -> pd.DataFrame:
    """Create categorical bins from continuous variables."""
    df = df.copy()

    print("\n--- Binning ---")

    # Age bins
    bins = [0, 25, 35, 45, 100]
    labels = ['Junior', 'Mid-Level', 'Senior', 'Executive']
    df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)
    print(f"  Age groups:\n{df['age_group'].value_counts().to_string()}")

    # Salary quantile bins
    df['salary_quartile'] = pd.qcut(df['salary_mid'], q=4,
                                     labels=['Q1', 'Q2', 'Q3', 'Q4'])
    print(f"\n  Salary quartiles:\n{df['salary_quartile'].value_counts().to_string()}")

    return df


# =============================================================================
# Step 6: Final Report
# =============================================================================

def final_report(original: pd.DataFrame, cleaned: pd.DataFrame) -> None:
    """Show before/after comparison."""
    print("\n=== Final Report ===")
    print(f"Original: {original.shape[0]} rows, {original.shape[1]} columns")
    print(f"Cleaned:  {cleaned.shape[0]} rows, {cleaned.shape[1]} columns")
    print(f"\nMissing values remaining: {cleaned.isnull().sum().sum()}")
    print(f"\nCleaned columns: {cleaned.columns.tolist()}")
    print(f"\nSample (first 3 rows):")
    print(cleaned.head(3).to_string())


# =============================================================================
# Main
# =============================================================================

if __name__ == '__main__':
    # Create and inspect
    raw_df = create_sample_data()
    print("=== Raw Data ===")
    print(raw_df.to_string())
    print()
    inspect_data(raw_df)

    # Clean
    cleaned_df = clean_data(raw_df)

    # Normalize
    cleaned_df = normalize_data(cleaned_df)

    # Bin
    cleaned_df = bin_data(cleaned_df)

    # Report
    final_report(raw_df, cleaned_df)