22 min · Lesson 25 of 34
Python for Data Science
NumPy Arrays & Operations
NumPy Arrays & Operations: Python's Numerical Foundation
NumPy is the backbone of Python's data science ecosystem. Pandas, scikit-learn, PyTorch, TensorFlow — they all use NumPy arrays internally. Mastering NumPy means understanding the tool that powers scientific computing in Python.
Why NumPy?
import numpy as np
import time
# Benchmark: double one million integers with a plain Python list and with a
# NumPy array, and print how long each approach takes.
#
# time.perf_counter() is used rather than time.time(): it is the
# highest-resolution clock available for measuring short durations, whereas
# the wall clock can be too coarse to resolve the very short NumPy run.
size = 1_000_000

# Python list math (slow — the interpreter visits each element in turn)
py_list = list(range(size))
start = time.perf_counter()
result = [x * 2 for x in py_list]
print(f"Python list: {time.perf_counter()-start:.3f}s")

# NumPy array math (fast — a single vectorized operation in compiled code)
np_array = np.arange(size)
start = time.perf_counter()
result = np_array * 2  # No loop needed!
print(f"NumPy array: {time.perf_counter()-start:.3f}s")

# NumPy is typically quoted as 50-100x faster for this kind of element-wise
# arithmetic; the exact ratio depends on the machine and the array size.
Creating Arrays
# --- Arrays built from (nested) Python lists ---
arr_1d = np.array([1, 2, 3, 4, 5])
arr_2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
arr_3d = np.array([
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]],
])

# --- Inspecting an array ---
print(arr_2d.shape)  # (2, 3): two rows, three columns
print(arr_2d.ndim)   # 2: number of axes
print(arr_2d.dtype)  # int64 on most 64-bit setups (default integer type is platform-dependent)
print(arr_2d.size)   # 6: total number of elements

# --- Constructors that take a target shape ---
zeros = np.zeros(shape=(3, 4))           # 3×4, every entry 0.0
ones = np.ones(shape=(2, 3), dtype=int)  # 2×3, every entry the integer 1
eye = np.eye(N=3)                        # 3×3 identity matrix
empty = np.empty(shape=(2, 2))           # 2×2, contents uninitialized (skips the fill zeros performs)

# --- Evenly spaced values ---
arange = np.arange(0, 10, 2)         # step 2, stop excluded: [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, num=5)  # 5 points, both ends included: [0, 0.25, 0.5, 0.75, 1.0]
logspace = np.logspace(0, 3, num=4)  # 10**0 .. 10**3: [1, 10, 100, 1000]

# --- Random data (seeded so repeated runs give the same numbers) ---
np.random.seed(42)
rand_uniform = np.random.rand(3, 4)                        # uniform on [0, 1)
rand_normal = np.random.randn(3, 4)                        # standard normal
rand_int = np.random.randint(low=0, high=10, size=(3, 4))  # integers 0..9
rand_choice = np.random.choice(a=[1, 2, 3, 4, 5], size=10, replace=True)  # sample with replacement
Indexing and Slicing
# Indexing, slicing, boolean masks and fancy indexing on a 3×4 example array.
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])

# Basic indexing: arr[row, column]
print(arr[0, 2])   # 3 (row 0, column 2)
print(arr[2, -1])  # 12 (row 2, last column)

# Slicing
print(arr[0, :])      # [1 2 3 4] (first row)
print(arr[:, 1])      # [2 6 10] (second column)
print(arr[1:, 2:])    # [[7 8], [11 12]] (submatrix)
print(arr[::2, ::2])  # [[1 3], [9 11]] (every other element)

# Boolean indexing
mask = arr > 6
print(arr[mask])  # [7 8 9 10 11 12]

# Assignment through a boolean mask modifies the array in place, so it is
# done on a copy. Applying it to `arr` itself would zero arr[0, 1], and the
# fancy-indexing example below would print [0 12] instead of [2 12].
zeroed = arr.copy()
zeroed[zeroed < 5] = 0  # Set all values < 5 to 0 -> first row becomes [0 0 0 0]

# Fancy indexing: the index lists are paired element by element
rows = [0, 2]
cols = [1, 3]
print(arr[rows, cols])  # [arr[0,1], arr[2,3]] = [2, 12]
Vectorized Operations
a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])

# Arithmetic operators work element by element; no explicit Python loop.
elementwise_sum = a + b
print(elementwise_sum)      # [11 22 33 44]
elementwise_product = a * b
print(elementwise_product)  # [10 40 90 160]
squares = a ** 2
print(squares)              # [1 4 9 16]
roots = np.sqrt(a)
print(roots)                # approx. [1. 1.41 1.73 2.]

# Universal functions (ufuncs) apply a math function to every element.
angles = np.linspace(0, np.pi, 5)
print(np.sin(angles))   # approx. [0, 0.707, 1, 0.707, 0]
print(np.exp(a))        # approx. [2.718, 7.389, 20.086, 54.598]
print(np.log(a))        # approx. [0, 0.693, 1.099, 1.386]
signed_values = np.array([-3, -1, 2, -4])
print(np.abs(signed_values))  # [3 1 2 4]
Broadcasting: NumPy's Superpower
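Broadcasting lets arrays with different but compatible shapes be combined element-wise. NumPy compares the shapes starting from the last axis and virtually stretches any axis of length 1 (or a missing leading axis) to match the other array, without copying data.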
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])  # Shape: (2, 3)

# A (3,) vector lines up with the last axis, so it is added to every row.
row = np.array([10, 20, 30])
print(arr + row)
# [[11, 22, 33],
#  [14, 25, 36]]

# A (2, 1) column is stretched across the three columns: one offset per row.
col = np.array([[100], [200]])
print(arr + col)
# [[101, 102, 103],
#  [204, 205, 206]]

# Feature standardization: 1000 samples (rows) × 10 features (columns).
data = np.random.randn(1000, 10)
mean = np.mean(data, axis=0)  # per-column mean, shape (10,)
std = np.std(data, axis=0)    # per-column standard deviation, shape (10,)
centered = data - mean        # (1000, 10) - (10,): subtracted from every row
normalized = centered / std   # (1000, 10) / (10,): each column scaled by its own std
print(normalized.mean(axis=0).round(10))  # All zeros (up to rounding)
print(normalized.std(axis=0).round(10))   # All ones
Aggregation and Statistics
data = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

# Reductions over every element of the array
print(np.sum(data))   # 21
print(np.mean(data))  # 3.5
print(np.std(data))   # 1.708 (population standard deviation)
print(np.max(data))   # 6
print(np.min(data))   # 1

# Reductions along one axis: axis=0 collapses the rows, axis=1 the columns
print(np.sum(data, axis=0))   # [5, 7, 9] — one sum per column
print(np.sum(data, axis=1))   # [6, 15] — one sum per row
print(np.mean(data, axis=0))  # [2.5, 3.5, 4.5] — one mean per column

# Other handy statistics
print(np.percentile(data, q=75))        # 75th percentile: 4.75
print(np.median(data))                  # 3.5
print(data.argmax())                    # 5 — position of the maximum in the flattened array
print(data.argmin())                    # 0 — position of the minimum in the flattened array
print(np.array([1, 2, 3, 4]).cumsum())  # [1, 3, 6, 10] — running total
Linear Algebra
A = np.array([
    [1, 2],
    [3, 4],
])
B = np.array([
    [5, 6],
    [7, 8],
])

# Matrix product, written two equivalent ways for 2-D arrays
C = A @ B     # [[19, 22], [43, 50]]
C = A.dot(B)  # Same result

# Routines from the linear-algebra submodule
linalg = np.linalg
print(linalg.det(A))  # Determinant: -2.0 (up to floating-point rounding)
print(linalg.inv(A))  # Inverse: [[-2, 1], [1.5, -0.5]]
eigenvalues, eigenvectors = linalg.eig(A)
print(eigenvalues)    # [-0.37, 5.37]

# Solve the linear system A x = b for x
b = np.array([5, 6])
x = linalg.solve(A, b)
print(x)              # [-4.   4.5]

# Singular value decomposition: A = U @ diag(S) @ Vt
U, S, Vt = linalg.svd(A)
Reshaping and Stacking
arr = np.arange(12)

# Reshape: same 12 elements, different layout
matrix = np.reshape(arr, (3, 4))   # 3 rows × 4 columns
cube = np.reshape(arr, (2, 2, 3))  # three axes
flat = matrix.flatten()            # 1-D again, always a fresh copy
view = matrix.ravel()              # 1-D again, a view whenever no copy is needed

# Transpose: rows become columns
print(np.transpose(matrix))
print(matrix.T.shape)  # (4, 3)

# Stack: join arrays along different directions
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
h_stack = np.hstack((a, b))          # end to end: [1, 2, 3, 4, 5, 6]
v_stack = np.vstack((a, b))          # as rows: [[1,2,3],[4,5,6]]
combined = np.column_stack((a, b))   # as columns: [[1,4],[2,5],[3,6]]

# Split: cut an array into equally sized pieces
parts = np.split(np.arange(9), indices_or_sections=3)  # [array([0,1,2]), array([3,4,5]), array([6,7,8])]
NumPy's speed and expressiveness make it the right tool for any numerical computation in Python — from simple statistics to matrix operations.
Next lesson: Pandas: DataFrames & Series — data analysis with the most powerful Python library.
📱
Get Notes Free → Get this course's notes on Telegram!
Free cheat sheets, summaries & practice exercises