Utility Functions¶

Mental Model

Utility functions like np.round, np.clip, np.unique, and np.diff handle the small but essential data-wrangling steps that sit between major computations. They are all vectorized, so there is no need to loop. Think of them as the data cleaning and preprocessing layer — the stage that converts raw arrays into analysis-ready form by enforcing bounds, removing duplicates, and detecting changes. This page works through np.round, np.isnan, np.nonzero, the NaN-aware reductions (np.nansum, np.nanmean, and related functions), and np.unique; np.diff appears in the exercises.

np.round¶

1. Basic Usage¶

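Round to a specified number of decimal places. Values that lie exactly halfway between two candidates are rounded to the nearest even value, so np.round(0.5) is 0.0 and np.round(1.5) is 2.0.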

```python
import numpy as np

def main():
    x = np.round(3.146592, 2)
    print(f"np.round(3.146592, 2) = {x}")

    y = np.round(-3.146592, 2)
    print(f"np.round(-3.146592, 2) = {y}")

if __name__ == "__main__":
    main()
```

Output:

```
np.round(3.146592, 2) = 3.15
np.round(-3.146592, 2) = -3.15
```

2. Array Rounding¶

```python
import numpy as np

def main():
    x = np.array([93.324, 84.237, -73.237, 68.342])

    print("Original:")
    print(x)
    print()

    rounded = np.round(x, 2)
    print("Rounded to 2 decimals:")
    print(rounded)

if __name__ == "__main__":
    main()
```

3. 2D Array¶

```python
import numpy as np

def main():
    x = np.array([
        [93.324, 84.237, -73.237, 68.342],
        [97.234, 67.236, -57.236, 23.567],
        [87.243, 87.423, -38.253, 77.342]
    ])

    print("Original:")
    print(x)
    print()

    rounded = np.round(x, 2)
    print("Rounded:")
    print(rounded)

if __name__ == "__main__":
    main()
```

np.isnan¶

1. Basic Usage¶

Check for NaN (Not a Number) values.

```python
import numpy as np

def main():
    x = np.array([1.0, np.nan, 3.0, np.nan, 5.0])

    print(f"Array: {x}")
    print(f"isnan: {np.isnan(x)}")

if __name__ == "__main__":
    main()
```

Output:

```
Array: [ 1. nan  3. nan  5.]
isnan: [False  True False  True False]
```

2. 2D Array¶

```python
import numpy as np

def main():
    x = np.array([
        [93., 84., 73., 68.],
        [97., 67., 57., np.nan],
        [87., 87., np.nan, 77.]
    ])

    print("Array:")
    print(x)
    print()

    bool_nan = np.isnan(x)
    print("isnan result:")
    print(bool_nan)

if __name__ == "__main__":
    main()
```

3. With np.round¶

NaN values pass through rounding unchanged.

```python
import numpy as np

def main():
    x = np.array([
        [93.324, 84.237, -73.237, 68.342],
        [97.234, 67.236, -57.236, np.nan],
        [87.243, np.nan, -38.253, 77.342]
    ])

    print("Original:")
    print(x)
    print()

    rounded = np.round(x, 2)
    print("Rounded (NaN preserved):")
    print(rounded)

if __name__ == "__main__":
    main()
```

np.nonzero¶

1. Basic Usage¶

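Return the indices of non-zero elements. The result is a tuple containing one index array per dimension, so for a 1-D array the indices themselves are the tuple's element [0].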

```python
import numpy as np

def main():
    a = np.array([0, 1, 0, 2, 0, 3])

    indices = np.nonzero(a)

    print(f"Array: {a}")
    print(f"Nonzero indices: {indices}")
    print(f"Nonzero values: {a[indices]}")

if __name__ == "__main__":
    main()
```

Output:

```
Array: [0 1 0 2 0 3]
Nonzero indices: (array([1, 3, 5]),)
Nonzero values: [1 2 3]
```

2. With Boolean Condition¶

```python
import numpy as np

def main():
    np.random.seed(80085)
    scores = np.round(np.random.uniform(low=30, high=100, size=10))

    print(f"Scores: {scores}")
    print()

    # Boolean mask
    failing = scores < 60
    print(f"Failing mask: {failing}")

    # Indices of failing scores
    fail_indices = np.nonzero(failing)
    print(f"Failing indices: {fail_indices}")
    print(f"Failing indices [0]: {fail_indices[0]}")

if __name__ == "__main__":
    main()
```

3. Modify Subset¶

```python
import numpy as np

def main():
    np.random.seed(80085)
    scores = np.round(np.random.uniform(low=30, high=100, size=10))

    print(f"Original: {scores}")

    # Find first 3 failing indices
    fail_idx = np.nonzero(scores < 60)[0][:3]
    print(f"First 3 failing indices: {fail_idx}")

    # Zero them out
    scores[fail_idx] = 0.
    print(f"Modified: {scores}")

if __name__ == "__main__":
    main()
```

np.nan Functions¶

1. np.nansum¶

Sum ignoring NaN values.

```python
import numpy as np

def main():
    a = np.array([1, 2, np.nan, 4, 5])

    print(f"Array: {a}")
    print(f"np.sum(a):    {np.sum(a)}")      # NaN propagates
    print(f"np.nansum(a): {np.nansum(a)}")   # Ignores NaN

if __name__ == "__main__":
    main()
```

2. np.nanmean¶

Mean ignoring NaN values.

```python
import numpy as np

def main():
    a = np.array([[1, 2, np.nan], [4, np.nan, 6]])

    print("Array:")
    print(a)
    print()

    print(f"np.mean(a):    {np.mean(a)}")
    print(f"np.nanmean(a): {np.nanmean(a)}")
    print()

    print(f"np.nanmean(a, axis=0): {np.nanmean(a, axis=0)}")
    print(f"np.nanmean(a, axis=1): {np.nanmean(a, axis=1)}")

if __name__ == "__main__":
    main()
```

3. Complete nan Family¶

```python
import numpy as np

def main():
    a = np.array([1, 2, np.nan, 4, np.nan, 6])

    print(f"Array: {a}")
    print()

    funcs = [
        ("nansum", np.nansum),
        ("nanmean", np.nanmean),
        ("nanstd", np.nanstd),
        ("nanvar", np.nanvar),
        ("nanmin", np.nanmin),
        ("nanmax", np.nanmax),
        ("nanmedian", np.nanmedian),
    ]

    for name, func in funcs:
        print(f"np.{name:10}(a) = {func(a):.4f}")

if __name__ == "__main__":
    main()
```

np.unique¶

1. Basic Usage¶

Extract unique elements from an array.

```python
import numpy as np

def main():
    a = np.array([3, 1, 2, 1, 3, 2, 1, 4, 3])

    unique_vals = np.unique(a)

    print(f"Original: {a}")
    print(f"Unique:   {unique_vals}")

if __name__ == "__main__":
    main()
```

Output:

```
Original: [3 1 2 1 3 2 1 4 3]
Unique:   [1 2 3 4]
```

2. Return Indices¶

```python
import numpy as np

def main():
    a = np.array([3, 1, 2, 1, 3, 2, 1, 4, 3])

    # return_index: first occurrence of each unique value
    unique_vals, first_idx = np.unique(a, return_index=True)

    print(f"Original: {a}")
    print(f"Unique values: {unique_vals}")
    print(f"First indices: {first_idx}")
    print(f"Verify: a[first_idx] = {a[first_idx]}")

if __name__ == "__main__":
    main()
```

3. Return Inverse¶

```python
import numpy as np

def main():
    a = np.array([3, 1, 2, 1, 3, 2, 1, 4, 3])

    # return_inverse: indices to reconstruct original
    unique_vals, inverse = np.unique(a, return_inverse=True)

    print(f"Original: {a}")
    print(f"Unique: {unique_vals}")
    print(f"Inverse: {inverse}")
    print(f"Reconstruct: {unique_vals[inverse]}")

if __name__ == "__main__":
    main()
```

np.unique Counts¶

1. Return Counts¶

```python
import numpy as np

def main():
    a = np.array([3, 1, 2, 1, 3, 2, 1, 4, 3])

    unique_vals, counts = np.unique(a, return_counts=True)

    print(f"Original: {a}")
    print(f"Unique: {unique_vals}")
    print(f"Counts: {counts}")

    # Most frequent
    most_freq_idx = np.argmax(counts)
    print(f"Most frequent: {unique_vals[most_freq_idx]} ({counts[most_freq_idx]} times)")

if __name__ == "__main__":
    main()
```

2. All Returns¶

```python
import numpy as np

def main():
    a = np.array([3, 1, 2, 1, 3, 2, 1, 4, 3])

    unique_vals, first_idx, inverse, counts = np.unique(
        a, return_index=True, return_inverse=True, return_counts=True
    )

    print(f"Unique values: {unique_vals}")
    print(f"First indices: {first_idx}")
    print(f"Inverse:       {inverse}")
    print(f"Counts:        {counts}")

if __name__ == "__main__":
    main()
```

3. 2D Arrays¶

```python
import numpy as np

def main():
    a = np.array([[1, 2, 1], [3, 2, 1], [1, 2, 1]])

    # Unique elements (flattened by default)
    print(f"Unique elements: {np.unique(a)}")

    # Unique rows
    unique_rows = np.unique(a, axis=0)
    print(f"Unique rows:\n{unique_rows}")

if __name__ == "__main__":
    main()
```

np.unique Applications¶

1. Category Encoding¶

```python
import numpy as np

def main():
    labels = np.array(['cat', 'dog', 'cat', 'bird', 'dog', 'cat'])

    unique_labels, encoded = np.unique(labels, return_inverse=True)

    print(f"Original: {labels}")
    print(f"Categories: {unique_labels}")
    print(f"Encoded: {encoded}")

if __name__ == "__main__":
    main()
```

2. Value Counts¶

```python
import numpy as np

def main():
    data = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])

    values, counts = np.unique(data, return_counts=True)

    # Sort by count (descending)
    sort_idx = np.argsort(counts)[::-1]

    print("Value counts (sorted):")
    for v, c in zip(values[sort_idx], counts[sort_idx]):
        print(f"  {v}: {c}")

if __name__ == "__main__":
    main()
```

3. Set Operations¶

```python
import numpy as np

def main():
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([4, 5, 6, 7, 8])

    print(f"a = {a}")
    print(f"b = {b}")
    print()

    # Union
    print(f"Union: {np.union1d(a, b)}")

    # Intersection
    print(f"Intersection: {np.intersect1d(a, b)}")

    # Difference (in a but not b)
    print(f"Difference (a-b): {np.setdiff1d(a, b)}")

if __name__ == "__main__":
    main()
```

Practical Examples¶

1. Clean Data¶

```python
import numpy as np

def main():
    # Data with missing values
    data = np.array([10, 20, np.nan, 40, np.nan, 60])

    print(f"Original: {data}")

    # Count NaN
    nan_count = np.sum(np.isnan(data))
    print(f"NaN count: {nan_count}")

    # Remove NaN
    clean = data[~np.isnan(data)]
    print(f"Cleaned: {clean}")

    # Replace NaN with mean
    filled = data.copy()
    filled[np.isnan(filled)] = np.nanmean(data)
    print(f"Filled: {filled}")

if __name__ == "__main__":
    main()
```

2. Format Output¶

```python
import numpy as np

def main():
    # Simulation results
    np.random.seed(42)
    results = np.random.randn(3, 4) * 100

    print("Raw results:")
    print(results)
    print()

    print("Rounded to 1 decimal:")
    print(np.round(results, 1))

if __name__ == "__main__":
    main()
```

3. Find Outliers¶

```python
import numpy as np

def main():
    np.random.seed(42)
    data = np.random.randn(100)

    # Add outliers
    data[10] = 10
    data[50] = -8

    # Find outliers (|z| > 3)
    z_scores = np.abs((data - np.mean(data)) / np.std(data))
    outlier_idx = np.nonzero(z_scores > 3)[0]

    print(f"Data shape: {data.shape}")
    print(f"Outlier indices: {outlier_idx}")
    print(f"Outlier values: {data[outlier_idx]}")
    print(f"Outlier z-scores: {np.round(z_scores[outlier_idx], 2)}")

if __name__ == "__main__":
    main()
```

Exercises¶

Exercise 1. Create a = np.array([1.234, 2.567, 3.891]). Round it to 1 decimal place with np.round, then apply np.floor and np.ceil to the original array. Print all three results and explain the difference.

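Solution to Exercise 1

np.round(a, 1) moves each value to the nearest number with one decimal place, np.floor always rounds down to the nearest integer (toward negative infinity), and np.ceil always rounds up to the nearest integer (toward positive infinity).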

```python
import numpy as np

a = np.array([1.234, 2.567, 3.891])
print(f"Round: {np.round(a, 1)}")  # [1.2 2.6 3.9]
print(f"Floor: {np.floor(a)}")      # [1. 2. 3.]
print(f"Ceil:  {np.ceil(a)}")       # [2. 3. 4.]
```

Exercise 2. Use np.unique to find the unique elements and their counts in a = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4]). Print both the unique values and their counts.

Solution to Exercise 2

```python
import numpy as np

a = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
vals, counts = np.unique(a, return_counts=True)
print(f"Unique: {vals}")
print(f"Counts: {counts}")
```

Exercise 3. Use np.diff to compute the first and second differences of a = np.array([1, 4, 9, 16, 25]) (perfect squares). Verify that the first differences are odd numbers and the second differences are all 2.

Solution to Exercise 3

```python
import numpy as np

a = np.array([1, 4, 9, 16, 25])
d1 = np.diff(a)
d2 = np.diff(a, n=2)
print(f"First differences: {d1}")   # [3 5 7 9]
print(f"Second differences: {d2}")  # [2 2 2]
print(f"All second diffs are 2: {np.all(d2 == 2)}")
```

Exercise 4. Use np.unique with return_inverse=True to encode categorical labels. Given labels = np.array(["cat", "dog", "cat", "bird", "dog", "cat"]), get the unique categories and the integer encoding. Then reconstruct the original labels from the encoding.

Solution to Exercise 4

```python
import numpy as np

labels = np.array(["cat", "dog", "cat", "bird", "dog", "cat"])
categories, encoded = np.unique(labels, return_inverse=True)

print(f"Categories: {categories}")  # ['bird' 'cat' 'dog']
print(f"Encoded:    {encoded}")     # [1 2 1 0 2 1]

# Reconstruct
reconstructed = categories[encoded]
print(f"Reconstructed: {reconstructed}")
print(f"Match: {np.array_equal(labels, reconstructed)}")  # True
```

Exercise 5. Use NumPy's NaN-safe functions to compute summary statistics of an array with missing values. Given a = np.array([1, np.nan, 3, np.nan, 5, 6, np.nan, 8]), compute the mean, the max, and the count of non-NaN values using np.nanmean, np.nanmax, and np.count_nonzero(~np.isnan(a)).

Solution to Exercise 5

```python
import numpy as np

a = np.array([1, np.nan, 3, np.nan, 5, 6, np.nan, 8])

print(f"nanmean: {np.nanmean(a):.2f}")     # 4.60
print(f"nanmax:  {np.nanmax(a):.2f}")       # 8.00
print(f"nanmin:  {np.nanmin(a):.2f}")       # 1.00

n_valid = np.count_nonzero(~np.isnan(a))
print(f"Valid count: {n_valid}")             # 5
print(f"NaN count:   {np.isnan(a).sum()}")  # 3

# Compare with non-nan-safe versions
print(f"np.mean(a): {np.mean(a)}")  # nan — NaN propagates!
```