Skip to content

dtype and Shape

Type System

1. Built-in dtypes

NumPy provides a rich type system:

import numpy as np

# Integer types
int8 = np.array([1, 2], dtype=np.int8)     # -128 to 127
int16 = np.array([1, 2], dtype=np.int16)   # -32768 to 32767
int32 = np.array([1, 2], dtype=np.int32)
int64 = np.array([1, 2], dtype=np.int64)   # Default integer on most platforms

# Unsigned integers
uint8 = np.array([1, 2], dtype=np.uint8)   # 0 to 255
uint16 = np.array([1, 2], dtype=np.uint16)

# Floating point
float16 = np.array([1.0], dtype=np.float16)  # Half precision
float32 = np.array([1.0], dtype=np.float32)  # Single precision
float64 = np.array([1.0], dtype=np.float64)  # Double (default)

# Complex
complex64 = np.array([1+2j], dtype=np.complex64)
complex128 = np.array([1+2j], dtype=np.complex128)

# Boolean
bool_arr = np.array([True, False], dtype=np.bool_)

# String
str_arr = np.array(['a', 'b'], dtype='U10')  # Unicode, max 10 chars

2. Type Inspection

arr = np.array([1, 2, 3])
print(arr.dtype)       # int64 (the repr is dtype('int64'))
print(arr.dtype.name)  # 'int64'
print(arr.dtype.kind)  # 'i' (integer)
print(arr.dtype.itemsize)  # 8 bytes

3. Type Casting

# Implicit upcasting
arr1 = np.array([1, 2, 3])       # int64
arr2 = np.array([1.0, 2.0, 3.0]) # float64
result = arr1 + arr2             # float64 (upcast)

# Explicit casting
arr_int = np.array([1.5, 2.7, 3.9])
arr_int_cast = arr_int.astype(np.int32)  # [1, 2, 3] (truncates toward zero, does not round)

# Safe casting check
can_cast = np.can_cast(np.float64, np.int32)
print(can_cast)  # False

Shape Metadata

1. Dimension Properties

# 1D array
arr1d = np.array([1, 2, 3, 4])
print(arr1d.shape)  # (4,)
print(arr1d.ndim)   # 1

# 2D array
arr2d = np.array([[1, 2, 3], [4, 5, 6]])
print(arr2d.shape)  # (2, 3)
print(arr2d.ndim)   # 2

# 3D array
arr3d = np.arange(24).reshape(2, 3, 4)
print(arr3d.shape)  # (2, 3, 4)
print(arr3d.ndim)   # 3

2. Reshaping

arr = np.arange(12)

# Explicit shape
reshaped = arr.reshape(3, 4)
print(reshaped.shape)  # (3, 4)

# Infer one dimension
reshaped = arr.reshape(3, -1)  # (3, 4) inferred
reshaped = arr.reshape(-1, 4)  # (3, 4) inferred

# Flatten
flat = reshaped.reshape(-1)  # (12,)

3. Dimension Manipulation

arr = np.array([1, 2, 3])

# Add dimension
arr_2d = arr[np.newaxis, :]  # (1, 3)
arr_2d = arr[:, np.newaxis]  # (3, 1)

# Using reshape
arr_2d = arr.reshape(1, -1)  # (1, 3)
arr_2d = arr.reshape(-1, 1)  # (3, 1)

# Squeeze (remove size-1 dimensions)
arr_squeezed = np.array([[[1, 2, 3]]])  # (1, 1, 3)
print(arr_squeezed.squeeze().shape)     # (3,)

Structured dtypes

1. Record Arrays

# Define structured dtype
dt = np.dtype([('name', 'U10'), ('age', 'i4'), ('weight', 'f4')])

# Create array
data = np.array([
    ('Alice', 25, 55.5),
    ('Bob', 30, 75.0)
], dtype=dt)

print(data['name'])    # ['Alice' 'Bob']
print(data['age'])     # [25 30]
print(data[0])         # ('Alice', 25, 55.5)

2. Field Access

# Access by field
ages = data['age']
ages[0] = 26
print(data[0]['age'])  # 26 - data['age'] is a view, so writing to it modifies the original array

3. Nested Structures

dt = np.dtype([
    ('name', 'U10'),
    ('position', [('x', 'f4'), ('y', 'f4')])
])

data = np.array([
    ('Alice', (1.0, 2.0)),
    ('Bob', (3.0, 4.0))
], dtype=dt)

print(data['position']['x'])  # [1. 3.]

Type Promotion

1. Automatic Promotion

# Integer + Float → Float
arr_int = np.array([1, 2, 3])
arr_float = np.array([1.0, 2.0, 3.0])
result = arr_int + arr_float
print(result.dtype)  # float64

# Smaller + Larger → Larger
arr_8 = np.array([1], dtype=np.int8)
arr_64 = np.array([1], dtype=np.int64)
result = arr_8 + arr_64
print(result.dtype)  # int64

2. Result Type

# Check result dtype before operation
result_dtype = np.result_type(np.int8, np.float32)
print(result_dtype)  # float32

3. Promotion Rules

# bool < int < float < complex
print(np.result_type(np.bool_, np.int32))     # int32
print(np.result_type(np.int32, np.float64))   # float64
print(np.result_type(np.float32, np.complex64))  # complex64

Memory Efficiency

1. Choosing dtypes

# Small integers - use smaller types
small_ints = np.arange(100, dtype=np.int8)  # 100 bytes
large_ints = np.arange(100, dtype=np.int64) # 800 bytes

# Precision trade-offs
half = np.array([1.0], dtype=np.float16)    # 2 bytes, less precision
double = np.array([1.0], dtype=np.float64)  # 8 bytes, more precision

2. String Optimization

# Fixed-length strings
arr = np.array(['a', 'bb', 'ccc'], dtype='U3')  # 3 chars max
print(arr.dtype)  # '<U3'
print(arr.nbytes) # 36 bytes (3 items × 3 chars × 4 bytes/char)

# Oversized strings waste memory
arr_big = np.array(['a', 'bb', 'ccc'], dtype='U100')  # wasteful
print(arr_big.nbytes)  # 1200 bytes

3. Boolean Masking

# Boolean arrays are memory efficient for masks
arr = np.arange(1000000)
mask = arr > 500000  # dtype=bool, 1 byte per element
filtered = arr[mask]

Shape Broadcasting

1. Dimension Compatibility

# Same shape - compatible
arr1 = np.ones((3, 4))
arr2 = np.ones((3, 4))
result = arr1 + arr2  # ✅ (3, 4)

# Trailing dimensions match
arr1 = np.ones((5, 3, 4))
arr2 = np.ones((3, 4))
result = arr1 + arr2  # ✅ (5, 3, 4)

# Size-1 dimensions stretch
arr1 = np.ones((3, 1))
arr2 = np.ones((1, 4))
result = arr1 + arr2  # ✅ (3, 4)

2. Shape Rules

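Shapes are compared dimension by dimension, starting from the trailing (rightmost) dimension. Broadcasting works when, for each pair of dimensions:

- The dimensions are equal, or
- One of the dimensions is 1, or
- The dimension doesn't exist in one array (it is treated as size 1, added on the left)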

# (3, 4) + (4,) → (3, 4) + (1, 4) → (3, 4) ✅
# (3, 4) + (3,) → (3, 4) + (1, 3) → incompatible ❌ (trailing dimensions 4 and 3 differ)
# (3, 4) + (3, 1) → (3, 4) ✅

3. Explicit Broadcasting

arr1 = np.array([[1], [2], [3]])  # (3, 1)
arr2 = np.array([10, 20, 30])     # (3,)

# Add the leading axis explicitly (NumPy would align (3,) as (1, 3) automatically)
arr2_broadcast = arr2[np.newaxis, :]  # (1, 3)
result = arr1 + arr2_broadcast        # (3, 3)