"""
Python Data Science and Machine Learning - Exercises
Practice NumPy, Pandas, visualization, and ML concepts
"""
import numpy as np
from typing import List, Tuple, Dict, Any, Optional
# ============================================================
# EXERCISE 1: NumPy Array Operations
# ============================================================
"""
Create functions for common array operations.
"""
def create_identity_matrix(n: int) -> np.ndarray:
"""
Create an n x n identity matrix.
Example:
>>> create_identity_matrix(3)
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
"""
# YOUR CODE HERE
pass
def normalize_array(arr: np.ndarray) -> np.ndarray:
"""
Normalize array to have mean=0 and std=1 (z-score normalization).
Formula: (x - mean) / std
Example:
>>> arr = np.array([10, 20, 30, 40, 50])
>>> normalize_array(arr)
array([-1.41, -0.71, 0. , 0.71, 1.41]) # approximately
"""
# YOUR CODE HERE
pass
def min_max_scale(arr: np.ndarray) -> np.ndarray:
"""
Scale array to range [0, 1].
Formula: (x - min) / (max - min)
Example:
>>> arr = np.array([10, 20, 30, 40, 50])
>>> min_max_scale(arr)
array([0. , 0.25, 0.5 , 0.75, 1. ])
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 2: Matrix Operations
# ============================================================
"""
Implement common matrix operations.
"""
def matrix_multiply(A: np.ndarray, B: np.ndarray) -> np.ndarray:
"""
Multiply two matrices A and B.
Example:
>>> A = np.array([[1, 2], [3, 4]])
>>> B = np.array([[5, 6], [7, 8]])
>>> matrix_multiply(A, B)
array([[19, 22],
[43, 50]])
"""
# YOUR CODE HERE
pass
def compute_statistics(arr: np.ndarray) -> Dict[str, float]:
"""
Compute common statistics for an array.
Returns:
Dictionary with: mean, median, std, var, min, max, range
Example:
>>> arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> stats = compute_statistics(arr)
>>> stats['mean']
5.5
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 3: Pandas Data Cleaning (without pandas import)
# ============================================================
"""
Implement data cleaning functions using numpy.
These simulate what pandas does internally.
"""
def find_missing_values(data: np.ndarray) -> Tuple[int, np.ndarray]:
"""
Find missing values (NaN) in array.
Returns:
(count of missing, indices of missing values)
Example:
>>> data = np.array([1, np.nan, 3, np.nan, 5])
>>> count, indices = find_missing_values(data)
>>> count
2
>>> indices
array([1, 3])
"""
# YOUR CODE HERE
pass
def fill_missing_with_mean(data: np.ndarray) -> np.ndarray:
"""
Replace NaN values with the mean of non-NaN values.
Example:
>>> data = np.array([1, np.nan, 3, np.nan, 5])
>>> fill_missing_with_mean(data)
array([1., 3., 3., 3., 5.])
"""
# YOUR CODE HERE
pass
def remove_outliers(data: np.ndarray, threshold: float = 2.0) -> np.ndarray:
"""
Remove outliers beyond threshold standard deviations from mean.
Example:
>>> data = np.array([1, 2, 3, 4, 5, 100])
>>> remove_outliers(data, threshold=2)
array([1, 2, 3, 4, 5])
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 4: Simple Linear Regression from Scratch
# ============================================================
"""
Implement linear regression without using sklearn.
"""
class SimpleLinearRegression:
"""
Simple linear regression using ordinary least squares.
Formula: y = mx + b
m = sum((x - x_mean)(y - y_mean)) / sum((x - x_mean)^2)
b = y_mean - m * x_mean
Example:
>>> X = np.array([1, 2, 3, 4, 5])
>>> y = np.array([2, 4, 6, 8, 10])
>>> model = SimpleLinearRegression()
>>> model.fit(X, y)
>>> model.predict(np.array([6, 7]))
array([12., 14.])
"""
def __init__(self):
self.slope = None
self.intercept = None
def fit(self, X: np.ndarray, y: np.ndarray):
"""Fit the model to training data."""
# YOUR CODE HERE
pass
def predict(self, X: np.ndarray) -> np.ndarray:
"""Make predictions."""
# YOUR CODE HERE
pass
def r_squared(self, X: np.ndarray, y: np.ndarray) -> float:
"""Calculate R² score."""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 5: K-Nearest Neighbors from Scratch
# ============================================================
"""
Implement KNN classifier without using sklearn.
"""
class SimpleKNN:
"""
K-Nearest Neighbors classifier.
Example:
>>> X_train = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
>>> y_train = np.array([0, 0, 1, 1])
>>> knn = SimpleKNN(k=3)
>>> knn.fit(X_train, y_train)
>>> knn.predict(np.array([[2.5, 2.5]]))
array([0]) # or [1], depending on implementation
"""
def __init__(self, k: int = 3):
self.k = k
self.X_train = None
self.y_train = None
def fit(self, X: np.ndarray, y: np.ndarray):
"""Store training data."""
# YOUR CODE HERE
pass
def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
"""Calculate Euclidean distance between two points."""
# YOUR CODE HERE
pass
def _predict_single(self, x: np.ndarray) -> int:
"""Predict class for a single sample."""
# YOUR CODE HERE
pass
def predict(self, X: np.ndarray) -> np.ndarray:
"""Predict classes for multiple samples."""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 6: Data Preprocessing Pipeline
# ============================================================
"""
Create a preprocessing pipeline class.
"""
class DataPreprocessor:
"""
Preprocessing pipeline for numerical data.
Supports:
- Handling missing values
- Scaling (standardization or normalization)
- Outlier removal
Example:
>>> preprocessor = DataPreprocessor(
... handle_missing='mean',
... scaling='standard',
... remove_outliers_threshold=3.0
... )
>>> preprocessor.fit(X_train)
>>> X_train_clean = preprocessor.transform(X_train)
>>> X_test_clean = preprocessor.transform(X_test)
"""
def __init__(
self,
handle_missing: str = 'mean', # 'mean', 'median', 'drop'
scaling: Optional[str] = None, # 'standard', 'minmax', None
remove_outliers_threshold: Optional[float] = None
):
self.handle_missing = handle_missing
self.scaling = scaling
self.remove_outliers_threshold = remove_outliers_threshold
# These are computed during fit
self._means = None
self._stds = None
self._mins = None
self._maxs = None
self._medians = None
def fit(self, X: np.ndarray):
"""Compute statistics from training data."""
# YOUR CODE HERE
pass
def transform(self, X: np.ndarray) -> np.ndarray:
"""Apply preprocessing to data."""
# YOUR CODE HERE
pass
def fit_transform(self, X: np.ndarray) -> np.ndarray:
"""Fit and transform in one step."""
self.fit(X)
return self.transform(X)
# ============================================================
# EXERCISE 7: Confusion Matrix and Metrics
# ============================================================
"""
Implement classification metrics from scratch.
"""
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
"""
Compute confusion matrix for binary classification.
Returns:
2x2 array: [[TN, FP], [FN, TP]]
Example:
>>> y_true = np.array([0, 0, 1, 1, 1])
>>> y_pred = np.array([0, 1, 0, 1, 1])
>>> confusion_matrix(y_true, y_pred)
array([[1, 1],
[1, 2]])
"""
# YOUR CODE HERE
pass
def precision_recall_f1(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
"""
Calculate precision, recall, and F1 score.
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1 = 2 * (precision * recall) / (precision + recall)
Example:
>>> y_true = np.array([0, 0, 1, 1, 1])
>>> y_pred = np.array([0, 1, 0, 1, 1])
>>> metrics = precision_recall_f1(y_true, y_pred)
>>> metrics['precision']
0.6666...
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 8: Cross-Validation from Scratch
# ============================================================
"""
Implement k-fold cross-validation.
"""
def k_fold_split(
X: np.ndarray,
y: np.ndarray,
k: int = 5,
shuffle: bool = True,
random_state: Optional[int] = None
) -> List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
"""
Split data into k folds for cross-validation.
Returns:
List of (X_train, X_val, y_train, y_val) tuples
Example:
>>> X = np.arange(10).reshape(-1, 1)
>>> y = np.arange(10)
>>> folds = k_fold_split(X, y, k=5)
>>> len(folds)
5
>>> X_train, X_val, y_train, y_val = folds[0]
"""
# YOUR CODE HERE
pass
def cross_val_score(
model, # Any model with fit and predict methods
X: np.ndarray,
y: np.ndarray,
k: int = 5,
metric_fn = None # Function(y_true, y_pred) -> score
) -> np.ndarray:
"""
Perform k-fold cross-validation and return scores.
Example:
>>> model = SimpleLinearRegression()
>>> scores = cross_val_score(model, X, y, k=5, metric_fn=r2_score)
>>> print(f"Mean: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 9: Feature Engineering
# ============================================================
"""
Implement feature engineering utilities.
"""
def polynomial_features(X: np.ndarray, degree: int = 2) -> np.ndarray:
"""
Generate polynomial features up to given degree.
For degree=2 and single feature x: returns [1, x, x²]
    For degree=2 and features [x1, x2], the full expansion would be [1, x1, x2, x1², x1*x2, x2²]; this simplified version keeps only per-feature powers (no interaction terms).
Example:
>>> X = np.array([[1], [2], [3]])
>>> polynomial_features(X, degree=2)
array([[1, 1, 1],
[1, 2, 4],
[1, 3, 9]])
"""
# YOUR CODE HERE (simplified version for single feature)
pass
def one_hot_encode(arr: np.ndarray) -> np.ndarray:
"""
One-hot encode a categorical array.
Example:
>>> arr = np.array([0, 1, 2, 0, 1])
>>> one_hot_encode(arr)
array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 0],
[0, 1, 0]])
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 10: Complete ML Pipeline
# ============================================================
"""
Create a complete ML pipeline class.
"""
class MLPipeline:
"""
Complete machine learning pipeline.
Steps:
1. Train/test split
2. Preprocessing
3. Model training
4. Evaluation
Example:
>>> pipeline = MLPipeline(
... model=SimpleLinearRegression(),
... preprocessor=DataPreprocessor(scaling='standard'),
... test_size=0.2
... )
>>> results = pipeline.run(X, y)
>>> print(results['test_score'])
"""
def __init__(
self,
model, # Model with fit, predict methods
preprocessor: Optional[DataPreprocessor] = None,
test_size: float = 0.2,
random_state: Optional[int] = None
):
self.model = model
self.preprocessor = preprocessor
self.test_size = test_size
self.random_state = random_state
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
def _train_test_split(
self,
X: np.ndarray,
y: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Split data into train and test sets."""
# YOUR CODE HERE
pass
def run(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
"""
Run the complete pipeline.
Returns:
Dictionary with train_score, test_score, predictions
"""
# YOUR CODE HERE
pass
# ============================================================
# SOLUTIONS (Uncomment to check your work)
# ============================================================
"""
# Solution 1: NumPy Array Operations
def create_identity_matrix(n: int) -> np.ndarray:
return np.eye(n)
def normalize_array(arr: np.ndarray) -> np.ndarray:
return (arr - arr.mean()) / arr.std()
def min_max_scale(arr: np.ndarray) -> np.ndarray:
return (arr - arr.min()) / (arr.max() - arr.min())
# Solution 2: Matrix Operations
def matrix_multiply(A: np.ndarray, B: np.ndarray) -> np.ndarray:
return np.dot(A, B) # or A @ B
def compute_statistics(arr: np.ndarray) -> Dict[str, float]:
return {
'mean': float(np.mean(arr)),
'median': float(np.median(arr)),
'std': float(np.std(arr)),
'var': float(np.var(arr)),
'min': float(np.min(arr)),
'max': float(np.max(arr)),
'range': float(np.max(arr) - np.min(arr))
}
# Solution 3: Data Cleaning
def find_missing_values(data: np.ndarray) -> Tuple[int, np.ndarray]:
mask = np.isnan(data)
return int(mask.sum()), np.where(mask)[0]
def fill_missing_with_mean(data: np.ndarray) -> np.ndarray:
result = data.copy()
mean_val = np.nanmean(data)
result[np.isnan(result)] = mean_val
return result
def remove_outliers(data: np.ndarray, threshold: float = 2.0) -> np.ndarray:
mean = np.mean(data)
std = np.std(data)
mask = np.abs(data - mean) <= threshold * std
return data[mask]
# Solution 4: Simple Linear Regression
class SimpleLinearRegression:
def __init__(self):
self.slope = None
self.intercept = None
def fit(self, X: np.ndarray, y: np.ndarray):
X = np.asarray(X).flatten()
y = np.asarray(y).flatten()
x_mean = X.mean()
y_mean = y.mean()
numerator = np.sum((X - x_mean) * (y - y_mean))
denominator = np.sum((X - x_mean) ** 2)
self.slope = numerator / denominator
self.intercept = y_mean - self.slope * x_mean
def predict(self, X: np.ndarray) -> np.ndarray:
X = np.asarray(X).flatten()
return self.slope * X + self.intercept
def r_squared(self, X: np.ndarray, y: np.ndarray) -> float:
y_pred = self.predict(X)
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
return 1 - (ss_res / ss_tot)
# Solution 5: KNN
class SimpleKNN:
def __init__(self, k: int = 3):
self.k = k
self.X_train = None
self.y_train = None
def fit(self, X: np.ndarray, y: np.ndarray):
self.X_train = np.asarray(X)
self.y_train = np.asarray(y)
def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
return np.sqrt(np.sum((x1 - x2) ** 2))
def _predict_single(self, x: np.ndarray) -> int:
distances = [self._euclidean_distance(x, x_train)
for x_train in self.X_train]
k_indices = np.argsort(distances)[:self.k]
k_labels = self.y_train[k_indices]
# Return most common label
values, counts = np.unique(k_labels, return_counts=True)
return values[np.argmax(counts)]
def predict(self, X: np.ndarray) -> np.ndarray:
X = np.asarray(X)
return np.array([self._predict_single(x) for x in X])
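# Solution 6: Data Preprocessing Pipeline
# (one possible sketch: the exercise leaves several choices open, so this
#  version assumes 2-D float input and nan-aware column statistics; note
#  that 'drop' and outlier removal change the row count, so y would need
#  the same mask applied outside this class)
class DataPreprocessor:
    def __init__(self, handle_missing: str = 'mean',
                 scaling: Optional[str] = None,
                 remove_outliers_threshold: Optional[float] = None):
        self.handle_missing = handle_missing
        self.scaling = scaling
        self.remove_outliers_threshold = remove_outliers_threshold
        self._means = self._stds = None
        self._mins = self._maxs = self._medians = None
    def fit(self, X: np.ndarray):
        X = np.asarray(X, dtype=float)
        self._means = np.nanmean(X, axis=0)
        self._stds = np.nanstd(X, axis=0)
        self._mins = np.nanmin(X, axis=0)
        self._maxs = np.nanmax(X, axis=0)
        self._medians = np.nanmedian(X, axis=0)
    def transform(self, X: np.ndarray) -> np.ndarray:
        X = np.asarray(X, dtype=float).copy()
        # 1. Missing values: fill from fit-time statistics, or drop rows
        if self.handle_missing in ('mean', 'median'):
            fill = self._means if self.handle_missing == 'mean' else self._medians
            nan_mask = np.isnan(X)
            X[nan_mask] = np.broadcast_to(fill, X.shape)[nan_mask]
        elif self.handle_missing == 'drop':
            X = X[~np.isnan(X).any(axis=1)]
        # 2. Outliers: drop rows with any |z-score| above the threshold
        safe_stds = np.where(self._stds == 0, 1, self._stds)
        if self.remove_outliers_threshold is not None:
            z = np.abs(X - self._means) / safe_stds
            X = X[(z <= self.remove_outliers_threshold).all(axis=1)]
        # 3. Scaling, always with training statistics (no refit on test data)
        if self.scaling == 'standard':
            X = (X - self._means) / safe_stds
        elif self.scaling == 'minmax':
            span = np.where(self._maxs == self._mins, 1, self._maxs - self._mins)
            X = (X - self._mins) / span
        return X
    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        self.fit(X)
        return self.transform(X)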
# Solution 7: Confusion Matrix
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
tn = np.sum((y_true == 0) & (y_pred == 0))
fp = np.sum((y_true == 0) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 0))
tp = np.sum((y_true == 1) & (y_pred == 1))
return np.array([[tn, fp], [fn, tp]])
def precision_recall_f1(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
return {
'precision': precision,
'recall': recall,
'f1': f1
}
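# Solution 8: Cross-Validation
# (one possible sketch: fold sizes come from np.array_split, and r2_score
#  below is the helper the cross_val_score docstring example assumes)
def k_fold_split(X: np.ndarray, y: np.ndarray, k: int = 5,
                 shuffle: bool = True,
                 random_state: Optional[int] = None):
    X, y = np.asarray(X), np.asarray(y)
    indices = np.arange(len(X))
    if shuffle:
        indices = np.random.default_rng(random_state).permutation(indices)
    folds = np.array_split(indices, k)
    splits = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((X[train_idx], X[val_idx], y[train_idx], y[val_idx]))
    return splits
def r2_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return float(1 - ss_res / ss_tot)
def cross_val_score(model, X: np.ndarray, y: np.ndarray, k: int = 5,
                    metric_fn=None) -> np.ndarray:
    if metric_fn is None:
        metric_fn = r2_score
    scores = []
    # The same model instance is refit on every fold, so fit() must
    # overwrite previous state (true for the models in this file)
    for X_train, X_val, y_train, y_val in k_fold_split(X, y, k=k):
        model.fit(X_train, y_train)
        scores.append(metric_fn(y_val, model.predict(X_val)))
    return np.array(scores)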
# Solution 9: Feature Engineering
def polynomial_features(X: np.ndarray, degree: int = 2) -> np.ndarray:
X = np.asarray(X)
if X.ndim == 1:
X = X.reshape(-1, 1)
n_samples = X.shape[0]
result = np.ones((n_samples, 1)) # Start with bias term
for d in range(1, degree + 1):
result = np.hstack([result, X ** d])
return result
def one_hot_encode(arr: np.ndarray) -> np.ndarray:
    arr = np.asarray(arr)
    # Map category values to column indices 0..n_classes-1 so the encoding
    # also works when the categories are not contiguous integers
    classes, inverse = np.unique(arr, return_inverse=True)
    result = np.zeros((len(arr), len(classes)), dtype=int)
    result[np.arange(len(arr)), inverse] = 1
    return result
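# Solution 10: Complete ML Pipeline
# (one possible sketch: assumes the preprocessor does not drop rows, so X
#  and y stay aligned, and scores with r2_score from Solution 8)
class MLPipeline:
    def __init__(self, model, preprocessor=None, test_size: float = 0.2,
                 random_state: Optional[int] = None):
        self.model = model
        self.preprocessor = preprocessor
        self.test_size = test_size
        self.random_state = random_state
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
    def _train_test_split(self, X: np.ndarray, y: np.ndarray):
        indices = np.random.default_rng(self.random_state).permutation(len(X))
        n_test = int(len(X) * self.test_size)
        test_idx, train_idx = indices[:n_test], indices[n_test:]
        return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    def run(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
        X, y = np.asarray(X), np.asarray(y)
        self.X_train, self.X_test, self.y_train, self.y_test = \
            self._train_test_split(X, y)
        # Fit the preprocessor on training data only to avoid leakage
        if self.preprocessor is not None:
            self.X_train = self.preprocessor.fit_transform(self.X_train)
            self.X_test = self.preprocessor.transform(self.X_test)
        self.model.fit(self.X_train, self.y_train)
        test_pred = self.model.predict(self.X_test)
        return {
            'train_score': r2_score(self.y_train, self.model.predict(self.X_train)),
            'test_score': r2_score(self.y_test, test_pred),
            'predictions': test_pred,
        }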
"""
if __name__ == "__main__":
print("Data Science and Machine Learning Exercises")
print("=" * 50)
print("\nComplete the exercises above to practice:")
print("- NumPy array operations")
print("- Data preprocessing")
print("- Building ML algorithms from scratch")
print("- Evaluation metrics")
print("- Cross-validation")
print("- Feature engineering")
print("\nUncomment the solutions to check your work!")
# Quick tests
print("\n--- Quick Tests ---")
# Test normalize_array if implemented
try:
arr = np.array([10, 20, 30, 40, 50])
result = normalize_array(arr)
if result is not None:
print(f"normalize_array: {result}")
    except Exception:
        pass
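    # Test min_max_scale if implemented (the doctest above expects
    # [0., 0.25, 0.5, 0.75, 1.] for this input)
    try:
        arr = np.array([10, 20, 30, 40, 50])
        result = min_max_scale(arr)
        if result is not None:
            print(f"min_max_scale: {result}")
    except Exception:
        pass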
print("\nRun this file after implementing the exercises!")