Statistics Methods¶
Mental Model
NumPy's statistics functions (mean, std, var, median, percentile) all follow the same pattern: pass an array and optionally an axis to reduce along. Watch the ddof parameter for std and var -- NumPy defaults to population statistics (ddof=0), while Pandas and most textbooks default to sample statistics (ddof=1).
At a deeper level, statistics are reductions on transformed data:
mean = sum / N, var = mean((x - μ)²), std = sqrt(var). Each statistic
first transforms the array (centering, squaring) and then reduces it. This
is why the axis and keepdims parameters work identically across all
statistical functions — they all share the same reduction machinery.
mean and np.mean¶
1. Basic Usage¶
Compute the arithmetic mean.
```python
import numpy as np

def main():
    a = np.array([[1, 2, 3], [4, 5, 6]])

    print("a =")
    print(a)
    print()

    print(f"{a.mean() = }")
    print(f"{a.mean(axis=0) = }")
    print(f"{a.mean(axis=1) = }")

if __name__ == "__main__":
    main()
```
Output:
```
a =
[[1 2 3]
 [4 5 6]]

a.mean() = 3.5
a.mean(axis=0) = array([2.5, 3.5, 4.5])
a.mean(axis=1) = array([2., 5.])
```
2. Output Shape¶
```python
import numpy as np

def main():
    a = np.random.standard_normal((2, 3))

    print(f"{a.mean().shape = }")
    print(f"{a.mean(axis=0).shape = }")
    print(f"{a.mean(axis=1).shape = }")

if __name__ == "__main__":
    main()
```
Output:
a.mean().shape = ()
a.mean(axis=0).shape = (3,)
a.mean(axis=1).shape = (2,)
3. Function Syntax¶
```python
import numpy as np

def main():
    a = np.random.standard_normal((2, 3))

    print(f"{np.mean(a).shape = }")
    print(f"{np.mean(a, axis=0).shape = }")
    print(f"{np.mean(a, axis=1).shape = }")

if __name__ == "__main__":
    main()
```
std and np.std¶
1. Basic Usage¶
Compute the standard deviation.
```python
import numpy as np

def main():
    a = np.array([[1, 2, 3], [4, 5, 6]])

    print("a =")
    print(a)
    print()

    print(f"{a.std() = :.4f}")
    print(f"{a.std(axis=0) = }")
    print(f"{a.std(axis=1) = }")

if __name__ == "__main__":
    main()
```
2. Output Shape¶
```python
import numpy as np

def main():
    a = np.random.standard_normal((2, 3))

    print(f"{a.std().shape = }")
    print(f"{a.std(axis=0).shape = }")
    print(f"{a.std(axis=1).shape = }")

if __name__ == "__main__":
    main()
```
3. ddof Parameter¶
The ddof (delta degrees of freedom) parameter controls the divisor: N - ddof.
```python
import numpy as np

def main():
    a = np.array([1, 2, 3, 4, 5])

    # Population std (ddof=0, default)
    pop_std = a.std(ddof=0)

    # Sample std (ddof=1)
    sample_std = a.std(ddof=1)

    print(f"Data: {a}")
    print(f"Population std (ddof=0): {pop_std:.4f}")
    print(f"Sample std (ddof=1): {sample_std:.4f}")

if __name__ == "__main__":
    main()
```
var and np.var¶
1. Basic Usage¶
Compute the variance (square of standard deviation).
```python
import numpy as np

def main():
    a = np.array([[1, 2], [3, 1], [2, 3]])

    print("a =")
    print(a)
    print()

    print(f"{a.var() = }")
    print(f"{a.var(axis=0) = }")
    print(f"{a.var(axis=1) = }")

if __name__ == "__main__":
    main()
```
2. Function Syntax¶
```python
import numpy as np

def main():
    a = np.array([[1, 2], [3, 1], [2, 3]])

    print("a =")
    print(a)
    print()

    print(f"{np.var(a) = }")
    print(f"{np.var(a, axis=0) = }")
    print(f"{np.var(a, axis=1) = }")

if __name__ == "__main__":
    main()
```
3. std vs var¶
```python
import numpy as np

def main():
    a = np.array([1, 2, 3, 4, 5])

    variance = a.var()
    std_dev = a.std()

    print(f"Variance: {variance:.4f}")
    print(f"Std Dev: {std_dev:.4f}")
    print(f"sqrt(var) = std: {np.sqrt(variance):.4f}")

if __name__ == "__main__":
    main()
```
np.median¶
1. Basic Usage¶
Compute the median (middle value).
```python
import numpy as np

def main():
    # Odd number of elements
    a = np.array([1, 3, 2, 5, 4])
    print(f"a = {a}")
    print(f"np.median(a) = {np.median(a)}")
    print()

    # Even number of elements (average of two middle)
    b = np.array([1, 3, 2, 5, 4, 6])
    print(f"b = {b}")
    print(f"np.median(b) = {np.median(b)}")

if __name__ == "__main__":
    main()
```
Output:
```
a = [1 3 2 5 4]
np.median(a) = 3.0

b = [1 3 2 5 4 6]
np.median(b) = 3.5
```
2. No Method Version¶
Unlike mean/std/var, median is only a function (not a method).
```python
import numpy as np

def main():
    sample_data = np.array([1.5, 2.5, 4, 2, 1, 1])

    # np.median() is a function only
    population_median = np.median(sample_data)
    print(f"{population_median = }")

    # No method version
    # sample_data.median()  # AttributeError

if __name__ == "__main__":
    main()
```
3. With axis Parameter¶
```python
import numpy as np

def main():
    a = np.array([[1, 5, 3], [2, 4, 6]])

    print("a =")
    print(a)
    print()

    print(f"np.median(a) = {np.median(a)}")
    print(f"np.median(a, axis=0) = {np.median(a, axis=0)}")
    print(f"np.median(a, axis=1) = {np.median(a, axis=1)}")

if __name__ == "__main__":
    main()
```
np.quantile¶
1. Basic Usage¶
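Compute quantiles — the same cut points as percentiles, but specified as fractions in [0, 1] rather than percentages in [0, 100]. For example, np.quantile(a, 0.25) returns the same value as np.percentile(a, 25).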
```python
import numpy as np

def main():
    data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

    q25 = np.quantile(data, 0.25)
    q50 = np.quantile(data, 0.50)
    q75 = np.quantile(data, 0.75)

    print(f"Data: {data}")
    print(f"Q1 (25%): {q25}")
    print(f"Q2 (50%): {q50}")
    print(f"Q3 (75%): {q75}")

if __name__ == "__main__":
    main()
```
2. Five Number Summary¶
```python
import numpy as np
import matplotlib.pyplot as plt

def main():
    data = np.array([1, 2, 0, 0, 0, 1, 3, 1, 2, 1, 2, 4, 5, -1, -2, 0, 8])

    quantiles = {
        "Min": 0,
        "Q1": 0.25,
        "Median": 0.5,
        "Q3": 0.75,
        "Max": 1
    }

    print("Five Number Summary:")
    for label, q in quantiles.items():
        quantile_value = np.quantile(data, q)
        print(f" {label:6} : {quantile_value}")

    fig, ax = plt.subplots(figsize=(2, 3))
    ax.boxplot(data)
    ax.set_title("Boxplot")
    plt.show()

if __name__ == "__main__":
    main()
```
3. Multiple Quantiles¶
```python
import numpy as np

def main():
    data = np.random.randn(1000)

    # Compute multiple quantiles at once
    qs = [0.1, 0.25, 0.5, 0.75, 0.9]
    values = np.quantile(data, qs)

    print("Quantiles:")
    for q, v in zip(qs, values):
        print(f" {q*100:5.1f}%: {v:+.3f}")

if __name__ == "__main__":
    main()
```
scipy.stats¶
1. Skewness¶
Skewness measures asymmetry of the distribution.
```python
import numpy as np
from scipy import stats

def main():
    x = np.random.normal(size=(1000,))

    skewness = stats.skew(x)
    print(f"{stats.skew(x) = :.4f}")

    # Positive skew: right tail longer
    # Negative skew: left tail longer
    # Near 0: symmetric

if __name__ == "__main__":
    main()
```
2. Kurtosis¶
Kurtosis measures the "tailedness" of the distribution.
```python
import numpy as np
from scipy import stats

def main():
    x = np.random.normal(size=(1000,))

    # scipy returns excess kurtosis (normal = 0)
    kurt = stats.kurtosis(x)
    print(f"{stats.kurtosis(x) = :.4f}")

    # Positive: heavier tails than normal
    # Negative: lighter tails than normal
    # Near 0: similar to normal

if __name__ == "__main__":
    main()
```
3. Complete Summary¶
```python
import numpy as np
from scipy import stats

def main():
    np.random.seed(42)
    data = np.random.normal(loc=5, scale=2, size=1000)

    print("Descriptive Statistics:")
    print(f" Mean: {np.mean(data):.4f}")
    print(f" Std: {np.std(data):.4f}")
    print(f" Var: {np.var(data):.4f}")
    print(f" Median: {np.median(data):.4f}")
    print(f" Skewness: {stats.skew(data):.4f}")
    print(f" Kurtosis: {stats.kurtosis(data):.4f}")

if __name__ == "__main__":
    main()
```
Exercises¶
Exercise 1.
Create a = np.array([2, 4, 4, 4, 5, 5, 7, 9]). Compute the mean, median, standard deviation, and variance. Verify that var = std**2.
Solution to Exercise 1
import numpy as np
a = np.array([2, 4, 4, 4, 5, 5, 7, 9], dtype=float)
print(f"Mean: {np.mean(a)}")
print(f"Median: {np.median(a)}")
print(f"Std: {np.std(a):.4f}")
print(f"Var: {np.var(a):.4f}")
print(f"var == std^2: {np.allclose(np.var(a), np.std(a)**2)}")
Exercise 2. Generate 10000 samples from a normal distribution with mean 5 and std 2. Compute the sample mean and standard deviation and verify they are close to the true parameters.
Solution to Exercise 2
import numpy as np
np.random.seed(42)
samples = np.random.normal(loc=5, scale=2, size=10000)
print(f"Sample mean: {samples.mean():.4f} (expected 5)")
print(f"Sample std: {samples.std():.4f} (expected 2)")
Exercise 3.
Compute the 25th, 50th (median), and 75th percentiles of a = np.random.randn(1000) using np.percentile. Verify that the 50th percentile matches np.median(a).
Solution to Exercise 3
import numpy as np
a = np.random.randn(1000)
p25, p50, p75 = np.percentile(a, [25, 50, 75])
print(f"25th: {p25:.4f}, 50th: {p50:.4f}, 75th: {p75:.4f}")
print(f"Median: {np.median(a):.4f}")
print(f"50th == median: {np.allclose(p50, np.median(a))}")
Exercise 4.
Explain the difference between np.std(a) and np.std(a, ddof=1). Compute both for a = np.array([2, 4, 4, 4, 5, 5, 7, 9]) and explain when to use each.
Solution to Exercise 4
import numpy as np
a = np.array([2, 4, 4, 4, 5, 5, 7, 9])
pop_std = np.std(a) # ddof=0 — population std
samp_std = np.std(a, ddof=1) # ddof=1 — sample std
print(f"Population std (ddof=0): {pop_std:.4f}") # 2.0000
print(f"Sample std (ddof=1): {samp_std:.4f}") # 2.1381
# ddof=0 divides by N — correct when a IS the entire population
# ddof=1 divides by N-1 (Bessel's correction) — unbiased when a
# is a sample from a larger population (the common case)
Exercise 5.
Compute the z-scores of each column in a (200, 3) matrix using mean and std with axis=0. Verify that each column of the result has mean approximately 0 and std approximately 1.
Solution to Exercise 5
import numpy as np
X = np.random.randn(200, 3) * 10 + 50
mu = X.mean(axis=0)
sigma = X.std(axis=0)
Z = (X - mu) / sigma
print(f"Column means: {np.round(Z.mean(axis=0), 10)}") # ~[0, 0, 0]
print(f"Column stds: {np.round(Z.std(axis=0), 10)}") # ~[1, 1, 1]