"""
Python Data Science and Machine Learning - Exercises
Practice NumPy, Pandas, visualization, and ML concepts
"""
import numpy as np
from typing import List, Tuple, Dict, Any, Optional
# ============================================================
# EXERCISE 1: NumPy Array Operations
# ============================================================
"""
Create functions for common array operations.
"""
def create_identity_matrix(n: int) -> np.ndarray:
"""
Create an n x n identity matrix.
Example:
>>> create_identity_matrix(3)
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
"""
# YOUR CODE HERE
pass
def normalize_array(arr: np.ndarray) -> np.ndarray:
"""
Normalize array to have mean=0 and std=1 (z-score normalization).
Formula: (x - mean) / std
Example:
>>> arr = np.array([10, 20, 30, 40, 50])
>>> normalize_array(arr)
array([-1.41, -0.71, 0. , 0.71, 1.41]) # approximately
"""
# YOUR CODE HERE
pass
def min_max_scale(arr: np.ndarray) -> np.ndarray:
"""
Scale array to range [0, 1].
Formula: (x - min) / (max - min)
Example:
>>> arr = np.array([10, 20, 30, 40, 50])
>>> min_max_scale(arr)
array([0. , 0.25, 0.5 , 0.75, 1. ])
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 2: Matrix Operations
# ============================================================
"""
Implement common matrix operations.
"""
def matrix_multiply(A: np.ndarray, B: np.ndarray) -> np.ndarray:
"""
Multiply two matrices A and B.
Example:
>>> A = np.array([[1, 2], [3, 4]])
>>> B = np.array([[5, 6], [7, 8]])
>>> matrix_multiply(A, B)
array([[19, 22],
[43, 50]])
"""
# YOUR CODE HERE
pass
def compute_statistics(arr: np.ndarray) -> Dict[str, float]:
"""
Compute common statistics for an array.
Returns:
Dictionary with: mean, median, std, var, min, max, range
Example:
>>> arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> stats = compute_statistics(arr)
>>> stats['mean']
5.5
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 3: Pandas Data Cleaning (without pandas import)
# ============================================================
"""
Implement data cleaning functions using numpy.
These simulate what pandas does internally.
"""
def find_missing_values(data: np.ndarray) -> Tuple[int, np.ndarray]:
"""
Find missing values (NaN) in array.
Returns:
(count of missing, indices of missing values)
Example:
>>> data = np.array([1, np.nan, 3, np.nan, 5])
>>> count, indices = find_missing_values(data)
>>> count
2
>>> indices
array([1, 3])
"""
# YOUR CODE HERE
pass
def fill_missing_with_mean(data: np.ndarray) -> np.ndarray:
"""
Replace NaN values with the mean of non-NaN values.
Example:
>>> data = np.array([1, np.nan, 3, np.nan, 5])
>>> fill_missing_with_mean(data)
array([1., 3., 3., 3., 5.])
"""
# YOUR CODE HERE
pass
def remove_outliers(data: np.ndarray, threshold: float = 2.0) -> np.ndarray:
"""
Remove outliers beyond threshold standard deviations from mean.
Example:
>>> data = np.array([1, 2, 3, 4, 5, 100])
>>> remove_outliers(data, threshold=2)
array([1, 2, 3, 4, 5])
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 4: Simple Linear Regression from Scratch
# ============================================================
"""
Implement linear regression without using sklearn.
"""
class SimpleLinearRegression:
"""
Simple linear regression using ordinary least squares.
Formula: y = mx + b
m = sum((x - x_mean)(y - y_mean)) / sum((x - x_mean)^2)
b = y_mean - m * x_mean
Example:
>>> X = np.array([1, 2, 3, 4, 5])
>>> y = np.array([2, 4, 6, 8, 10])
>>> model = SimpleLinearRegression()
>>> model.fit(X, y)
>>> model.predict(np.array([6, 7]))
array([12., 14.])
"""
def __init__(self):
self.slope = None
self.intercept = None
def fit(self, X: np.ndarray, y: np.ndarray):
"""Fit the model to training data."""
# YOUR CODE HERE
pass
def predict(self, X: np.ndarray) -> np.ndarray:
"""Make predictions."""
# YOUR CODE HERE
pass
def r_squared(self, X: np.ndarray, y: np.ndarray) -> float:
"""Calculate R² score."""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 5: K-Nearest Neighbors from Scratch
# ============================================================
"""
Implement KNN classifier without using sklearn.
"""
class SimpleKNN:
"""
K-Nearest Neighbors classifier.
Example:
>>> X_train = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
>>> y_train = np.array([0, 0, 1, 1])
>>> knn = SimpleKNN(k=3)
>>> knn.fit(X_train, y_train)
>>> knn.predict(np.array([[2.5, 2.5]]))
array([0]) # or [1], depending on implementation
"""
def __init__(self, k: int = 3):
self.k = k
self.X_train = None
self.y_train = None
def fit(self, X: np.ndarray, y: np.ndarray):
"""Store training data."""
# YOUR CODE HERE
pass
def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
"""Calculate Euclidean distance between two points."""
# YOUR CODE HERE
pass
def _predict_single(self, x: np.ndarray) -> int:
"""Predict class for a single sample."""
# YOUR CODE HERE
pass
def predict(self, X: np.ndarray) -> np.ndarray:
"""Predict classes for multiple samples."""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 6: Data Preprocessing Pipeline
# ============================================================
"""
Create a preprocessing pipeline class.
"""
class DataPreprocessor:
"""
Preprocessing pipeline for numerical data.
Supports:
- Handling missing values
- Scaling (standardization or normalization)
- Outlier removal
Example:
>>> preprocessor = DataPreprocessor(
... handle_missing='mean',
... scaling='standard',
... remove_outliers_threshold=3.0
... )
>>> preprocessor.fit(X_train)
>>> X_train_clean = preprocessor.transform(X_train)
>>> X_test_clean = preprocessor.transform(X_test)
"""
def __init__(
self,
handle_missing: str = 'mean', # 'mean', 'median', 'drop'
scaling: Optional[str] = None, # 'standard', 'minmax', None
remove_outliers_threshold: Optional[float] = None
):
self.handle_missing = handle_missing
self.scaling = scaling
self.remove_outliers_threshold = remove_outliers_threshold
# These are computed during fit
self._means = None
self._stds = None
self._mins = None
self._maxs = None
self._medians = None
def fit(self, X: np.ndarray):
"""Compute statistics from training data."""
# YOUR CODE HERE
pass
def transform(self, X: np.ndarray) -> np.ndarray:
"""Apply preprocessing to data."""
# YOUR CODE HERE
pass
def fit_transform(self, X: np.ndarray) -> np.ndarray:
"""Fit and transform in one step."""
self.fit(X)
return self.transform(X)
# ============================================================
# EXERCISE 7: Confusion Matrix and Metrics
# ============================================================
"""
Implement classification metrics from scratch.
"""
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
"""
Compute confusion matrix for binary classification.
Returns:
2x2 array: [[TN, FP], [FN, TP]]
Example:
>>> y_true = np.array([0, 0, 1, 1, 1])
>>> y_pred = np.array([0, 1, 0, 1, 1])
>>> confusion_matrix(y_true, y_pred)
array([[1, 1],
[1, 2]])
"""
# YOUR CODE HERE
pass
def precision_recall_f1(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
"""
Calculate precision, recall, and F1 score.
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1 = 2 * (precision * recall) / (precision + recall)
Example:
>>> y_true = np.array([0, 0, 1, 1, 1])
>>> y_pred = np.array([0, 1, 0, 1, 1])
>>> metrics = precision_recall_f1(y_true, y_pred)
>>> metrics['precision']
0.6666...
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 8: Cross-Validation from Scratch
# ============================================================
"""
Implement k-fold cross-validation.
"""
def k_fold_split(
X: np.ndarray,
y: np.ndarray,
k: int = 5,
shuffle: bool = True,
random_state: Optional[int] = None
) -> List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
"""
Split data into k folds for cross-validation.
Returns:
List of (X_train, X_val, y_train, y_val) tuples
Example:
>>> X = np.arange(10).reshape(-1, 1)
>>> y = np.arange(10)
>>> folds = k_fold_split(X, y, k=5)
>>> len(folds)
5
>>> X_train, X_val, y_train, y_val = folds[0]
"""
# YOUR CODE HERE
pass
def cross_val_score(
model, # Any model with fit and predict methods
X: np.ndarray,
y: np.ndarray,
k: int = 5,
metric_fn = None # Function(y_true, y_pred) -> score
) -> np.ndarray:
"""
Perform k-fold cross-validation and return scores.
Example:
>>> model = SimpleLinearRegression()
>>> scores = cross_val_score(model, X, y, k=5, metric_fn=r2_score)
>>> print(f"Mean: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 9: Feature Engineering
# ============================================================
"""
Implement feature engineering utilities.
"""
def polynomial_features(X: np.ndarray, degree: int = 2) -> np.ndarray:
"""
Generate polynomial features up to given degree.
For degree=2 and single feature x: returns [1, x, x²]
    For degree=2 and features [x1, x2], the full expansion would be [1, x1, x2, x1², x1*x2, x2²]; this simplified version keeps only per-feature powers (no interaction terms).
Example:
>>> X = np.array([[1], [2], [3]])
>>> polynomial_features(X, degree=2)
array([[1, 1, 1],
[1, 2, 4],
[1, 3, 9]])
"""
# YOUR CODE HERE (simplified version for single feature)
pass
def one_hot_encode(arr: np.ndarray) -> np.ndarray:
"""
One-hot encode a categorical array.
Example:
>>> arr = np.array([0, 1, 2, 0, 1])
>>> one_hot_encode(arr)
array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 0],
[0, 1, 0]])
"""
# YOUR CODE HERE
pass
# ============================================================
# EXERCISE 10: Complete ML Pipeline
# ============================================================
"""
Create a complete ML pipeline class.
"""
class MLPipeline:
"""
Complete machine learning pipeline.
Steps:
1. Train/test split
2. Preprocessing
3. Model training
4. Evaluation
Example:
>>> pipeline = MLPipeline(
... model=SimpleLinearRegression(),
... preprocessor=DataPreprocessor(scaling='standard'),
... test_size=0.2
... )
>>> results = pipeline.run(X, y)
>>> print(results['test_score'])
"""
def __init__(
self,
model, # Model with fit, predict methods
preprocessor: Optional[DataPreprocessor] = None,
test_size: float = 0.2,
random_state: Optional[int] = None
):
self.model = model
self.preprocessor = preprocessor
self.test_size = test_size
self.random_state = random_state
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
def _train_test_split(
self,
X: np.ndarray,
y: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Split data into train and test sets."""
# YOUR CODE HERE
pass
def run(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
"""
Run the complete pipeline.
Returns:
Dictionary with train_score, test_score, predictions
"""
# YOUR CODE HERE
pass
# ============================================================
# SOLUTIONS (Uncomment to check your work)
# ============================================================
"""
# Solution 1: NumPy Array Operations
def create_identity_matrix(n: int) -> np.ndarray:
return np.eye(n)
def normalize_array(arr: np.ndarray) -> np.ndarray:
return (arr - arr.mean()) / arr.std()
def min_max_scale(arr: np.ndarray) -> np.ndarray:
return (arr - arr.min()) / (arr.max() - arr.min())
# Solution 2: Matrix Operations
def matrix_multiply(A: np.ndarray, B: np.ndarray) -> np.ndarray:
return np.dot(A, B) # or A @ B
def compute_statistics(arr: np.ndarray) -> Dict[str, float]:
return {
'mean': float(np.mean(arr)),
'median': float(np.median(arr)),
'std': float(np.std(arr)),
'var': float(np.var(arr)),
'min': float(np.min(arr)),
'max': float(np.max(arr)),
'range': float(np.max(arr) - np.min(arr))
}
# Solution 3: Data Cleaning
def find_missing_values(data: np.ndarray) -> Tuple[int, np.ndarray]:
mask = np.isnan(data)
return int(mask.sum()), np.where(mask)[0]
def fill_missing_with_mean(data: np.ndarray) -> np.ndarray:
result = data.copy()
mean_val = np.nanmean(data)
result[np.isnan(result)] = mean_val
return result
def remove_outliers(data: np.ndarray, threshold: float = 2.0) -> np.ndarray:
mean = np.mean(data)
std = np.std(data)
mask = np.abs(data - mean) <= threshold * std
return data[mask]
# Solution 4: Simple Linear Regression
class SimpleLinearRegression:
def __init__(self):
self.slope = None
self.intercept = None
def fit(self, X: np.ndarray, y: np.ndarray):
X = np.asarray(X).flatten()
y = np.asarray(y).flatten()
x_mean = X.mean()
y_mean = y.mean()
numerator = np.sum((X - x_mean) * (y - y_mean))
denominator = np.sum((X - x_mean) ** 2)
self.slope = numerator / denominator
self.intercept = y_mean - self.slope * x_mean
def predict(self, X: np.ndarray) -> np.ndarray:
X = np.asarray(X).flatten()
return self.slope * X + self.intercept
def r_squared(self, X: np.ndarray, y: np.ndarray) -> float:
y_pred = self.predict(X)
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
return 1 - (ss_res / ss_tot)
# Solution 5: KNN
class SimpleKNN:
def __init__(self, k: int = 3):
self.k = k
self.X_train = None
self.y_train = None
def fit(self, X: np.ndarray, y: np.ndarray):
self.X_train = np.asarray(X)
self.y_train = np.asarray(y)
def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
return np.sqrt(np.sum((x1 - x2) ** 2))
def _predict_single(self, x: np.ndarray) -> int:
distances = [self._euclidean_distance(x, x_train)
for x_train in self.X_train]
k_indices = np.argsort(distances)[:self.k]
k_labels = self.y_train[k_indices]
# Return most common label
values, counts = np.unique(k_labels, return_counts=True)
return values[np.argmax(counts)]
def predict(self, X: np.ndarray) -> np.ndarray:
X = np.asarray(X)
return np.array([self._predict_single(x) for x in X])
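# Solution 6: Data Preprocessing Pipeline
# (one possible sketch: the exercise leaves several choices open, so this
#  version assumes 2-D float input and nan-aware column statistics; note
#  that 'drop' and outlier removal change the row count, so y would need
#  the same mask applied outside this class)
class DataPreprocessor:
    def __init__(self, handle_missing: str = 'mean',
                 scaling: Optional[str] = None,
                 remove_outliers_threshold: Optional[float] = None):
        self.handle_missing = handle_missing
        self.scaling = scaling
        self.remove_outliers_threshold = remove_outliers_threshold
        self._means = self._stds = None
        self._mins = self._maxs = self._medians = None
    def fit(self, X: np.ndarray):
        X = np.asarray(X, dtype=float)
        self._means = np.nanmean(X, axis=0)
        self._stds = np.nanstd(X, axis=0)
        self._mins = np.nanmin(X, axis=0)
        self._maxs = np.nanmax(X, axis=0)
        self._medians = np.nanmedian(X, axis=0)
    def transform(self, X: np.ndarray) -> np.ndarray:
        X = np.asarray(X, dtype=float).copy()
        # 1. Missing values: fill from fit-time statistics, or drop rows
        if self.handle_missing in ('mean', 'median'):
            fill = self._means if self.handle_missing == 'mean' else self._medians
            nan_mask = np.isnan(X)
            X[nan_mask] = np.broadcast_to(fill, X.shape)[nan_mask]
        elif self.handle_missing == 'drop':
            X = X[~np.isnan(X).any(axis=1)]
        # 2. Outliers: drop rows with any |z-score| above the threshold
        safe_stds = np.where(self._stds == 0, 1, self._stds)
        if self.remove_outliers_threshold is not None:
            z = np.abs(X - self._means) / safe_stds
            X = X[(z <= self.remove_outliers_threshold).all(axis=1)]
        # 3. Scaling, always with training statistics (no refit on test data)
        if self.scaling == 'standard':
            X = (X - self._means) / safe_stds
        elif self.scaling == 'minmax':
            span = np.where(self._maxs == self._mins, 1, self._maxs - self._mins)
            X = (X - self._mins) / span
        return X
    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        self.fit(X)
        return self.transform(X)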
# Solution 7: Confusion Matrix
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
tn = np.sum((y_true == 0) & (y_pred == 0))
fp = np.sum((y_true == 0) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 0))
tp = np.sum((y_true == 1) & (y_pred == 1))
return np.array([[tn, fp], [fn, tp]])
def precision_recall_f1(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
return {
'precision': precision,
'recall': recall,
'f1': f1
}
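# Solution 8: Cross-Validation
# (one possible sketch: fold sizes come from np.array_split, and r2_score
#  below is the helper the cross_val_score docstring example assumes)
def k_fold_split(X: np.ndarray, y: np.ndarray, k: int = 5,
                 shuffle: bool = True,
                 random_state: Optional[int] = None):
    X, y = np.asarray(X), np.asarray(y)
    indices = np.arange(len(X))
    if shuffle:
        indices = np.random.default_rng(random_state).permutation(indices)
    folds = np.array_split(indices, k)
    splits = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((X[train_idx], X[val_idx], y[train_idx], y[val_idx]))
    return splits
def r2_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return float(1 - ss_res / ss_tot)
def cross_val_score(model, X: np.ndarray, y: np.ndarray, k: int = 5,
                    metric_fn=None) -> np.ndarray:
    if metric_fn is None:
        metric_fn = r2_score
    scores = []
    # The same model instance is refit on every fold, so fit() must
    # overwrite previous state (true for the models in this file)
    for X_train, X_val, y_train, y_val in k_fold_split(X, y, k=k):
        model.fit(X_train, y_train)
        scores.append(metric_fn(y_val, model.predict(X_val)))
    return np.array(scores)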
# Solution 9: Feature Engineering
def polynomial_features(X: np.ndarray, degree: int = 2) -> np.ndarray:
X = np.asarray(X)
if X.ndim == 1:
X = X.reshape(-1, 1)
n_samples = X.shape[0]
result = np.ones((n_samples, 1)) # Start with bias term
for d in range(1, degree + 1):
result = np.hstack([result, X ** d])
return result
def one_hot_encode(arr: np.ndarray) -> np.ndarray:
    arr = np.asarray(arr)
    # Map category values to column indices 0..n_classes-1 so the encoding
    # also works when the categories are not contiguous integers
    classes, inverse = np.unique(arr, return_inverse=True)
    result = np.zeros((len(arr), len(classes)), dtype=int)
    result[np.arange(len(arr)), inverse] = 1
    return result
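# Solution 10: Complete ML Pipeline
# (one possible sketch: assumes the preprocessor does not drop rows, so X
#  and y stay aligned, and scores with r2_score from Solution 8)
class MLPipeline:
    def __init__(self, model, preprocessor=None, test_size: float = 0.2,
                 random_state: Optional[int] = None):
        self.model = model
        self.preprocessor = preprocessor
        self.test_size = test_size
        self.random_state = random_state
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
    def _train_test_split(self, X: np.ndarray, y: np.ndarray):
        indices = np.random.default_rng(self.random_state).permutation(len(X))
        n_test = int(len(X) * self.test_size)
        test_idx, train_idx = indices[:n_test], indices[n_test:]
        return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    def run(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
        X, y = np.asarray(X), np.asarray(y)
        self.X_train, self.X_test, self.y_train, self.y_test = \
            self._train_test_split(X, y)
        # Fit the preprocessor on training data only to avoid leakage
        if self.preprocessor is not None:
            self.X_train = self.preprocessor.fit_transform(self.X_train)
            self.X_test = self.preprocessor.transform(self.X_test)
        self.model.fit(self.X_train, self.y_train)
        test_pred = self.model.predict(self.X_test)
        return {
            'train_score': r2_score(self.y_train, self.model.predict(self.X_train)),
            'test_score': r2_score(self.y_test, test_pred),
            'predictions': test_pred,
        }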
"""
if __name__ == "__main__":
print("Data Science and Machine Learning Exercises")
print("=" * 50)
print("\nComplete the exercises above to practice:")
print("- NumPy array operations")
print("- Data preprocessing")
print("- Building ML algorithms from scratch")
print("- Evaluation metrics")
print("- Cross-validation")
print("- Feature engineering")
print("\nUncomment the solutions to check your work!")
# Quick tests
print("\n--- Quick Tests ---")
# Test normalize_array if implemented
try:
arr = np.array([10, 20, 30, 40, 50])
result = normalize_array(arr)
if result is not None:
print(f"normalize_array: {result}")
    except Exception:
        pass
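    # Test min_max_scale if implemented (the doctest above expects
    # [0., 0.25, 0.5, 0.75, 1.] for this input)
    try:
        arr = np.array([10, 20, 30, 40, 50])
        result = min_max_scale(arr)
        if result is not None:
            print(f"min_max_scale: {result}")
    except Exception:
        pass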
print("\nRun this file after implementing the exercises!")