Syllabus Map
NumPy
import numpy as np
Basic Arrays
# Creating arrays
np.array([1, 2, 3])
np.array([[1, 2], [3, 4]])
# Array attributes
arr.shape
arr.ndim
arr.size
arr.dtype
Shorthand Arrays
# Zero/One Arrays
np.zeros((3, 4))
np.ones((5,))
# Identity Matrix
np.eye(4)
# Range Array
np.arange(0, 10, 2)
# Random Array
np.random.seed(42)
np.random.rand(3, 4) # uniform [0, 1)
np.random.randn(3, 4) # standard normal (mean 0, std 1)
np.random.randint(0, 10, (3, 3))
Selection and Indexing
# Basic slicing
arr = np.arange(10)
arr[2:6]
arr[:5]
arr[::2]
# 2D slicing
A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
A[0]
A[:,1]
A[1:, :2]
# Filtering
A[A > 5]
A[(A > 2) & (A < 8)]
Vectorised Operations
# Elementwise Operations
x + y
x - y
x * y
x / y
x ** 2
np.sqrt(x)
# Matrix Multiplication
x @ y
np.dot(x, y)
np.cross(x, y) # vector cross product (not a matrix product)
# Broadcasting
X + 3
X + np.array([1, 2, 3])
Statistics
np.mean(X)
np.median(X)
np.std(X)
np.var(X)
np.min(X)
np.max(X)
np.percentile(X, 95)
Linear Algebra
np.linalg.norm(x)
np.dot(a, b) # np.linalg has no dot(); use np.dot / a @ b (or np.linalg.multi_dot for chains)
np.linalg.inv(A)
np.linalg.det(A)
np.linalg.eig(A)
np.linalg.svd(A)
Manipulating Arrays
# Reshape
X.reshape(100, 3)
# Flatten
X.flatten()
X.ravel()
# Transpose
X.T
# Expand and Squeeze
np.expand_dims(X, axis=0)
np.squeeze(X)
Stacking and Concatenation
# Stacking
np.hstack([a, b])
np.vstack([a, b])
# Concatenation
np.concatenate([a, b], axis=1)
ML-Specific
# Standardisation
X_std = (X - X.mean(axis=0)) / X.std(axis=0)
# Min-max scaling
X_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# One-hot encoding
np.eye(num_classes)[labels]
$\sigma(z) = \frac{1}{1 + e^{-z}}$
# Sigmoid
res = 1 / (1 + np.exp(-z))
$\mathrm{softmax}(z)_i = \frac{e^{z_i}}{\sum_{j=1}^{N} e^{z_j}}$
# Softmax
exp = np.exp(z - np.max(z, axis=1, keepdims=True)) # subtract the row-wise max for numerical stability
res = exp / exp.sum(axis=1, keepdims=True)
Pandas
import pandas as pd
Reading Files
# Read a .csv file
df = pd.read_csv("data.csv")
# Read a .xlsx file
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
# Read a .json file
df = pd.read_json("data.json")
# Read a .jsonl file
df = pd.read_json("data.jsonl", lines=True)
# Read a .parquet file
df = pd.read_parquet("file.parquet")
Inspecting Data
# Output top few rows
df.head()
# Output bottom few rows
df.tail()
# Output information about dataframe
df.info()
df.describe()
df.shape
df.columns
df.index
# Checking for null values
df.isna().sum()
# Checking for duplicates
df.duplicated().sum()
# Checking for unique values
df.nunique()
Selecting Data
# Selecting columns
df["col1"]
df[["col1", "col3"]]
# Selecting rows
df.iloc[2] # By integer position
df.loc[5] # By label
# Filtering rows
df[df['age'] > 30]
df[(df['age'] > 30) & (df['gender'] == 'M')]
df[df['col'].isin(['A', 'B'])]
# Query rows
df.query("age > 30 and gender == 'M'")
Cleaning Data
# Renaming columns
df.rename(columns={'old':'new'}, inplace=True)
# Dropping columns
df.drop(columns=['col1', 'col2'], inplace=True)
# Dropping rows
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
# Filling with mean/median
df['age'] = df['age'].fillna(df['age'].mean())
df['age'] = df['age'].fillna(df['age'].median())
# Filling with mode
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
# Filling with forward or backward values
df.ffill() # fillna(method='ffill') is deprecated in pandas 2.1+
df.bfill() # fillna(method='bfill') is deprecated in pandas 2.1+