"""
Python Data Science and Machine Learning - Examples
Comprehensive demonstrations using NumPy, Pandas, Matplotlib, and Scikit-learn
"""
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# ============================================================
# NUMPY FUNDAMENTALS
# ============================================================
print("=" * 60)
print("NUMPY FUNDAMENTALS")
print("=" * 60)
# --- Creating Arrays ---
print("\n--- Creating Arrays ---")
# From Python lists
arr1d = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"1D Array: {arr1d}")
print(f"2D Array:\n{arr2d}")
print(f"Shape: {arr2d.shape}, Dtype: {arr2d.dtype}")
# Special arrays
zeros = np.zeros((2, 3))
ones = np.ones((2, 3))
identity = np.eye(3)
range_arr = np.arange(0, 10, 2)
linspace = np.linspace(0, 1, 5)
print(f"\nZeros:\n{zeros}")
print(f"Identity:\n{identity}")
print(f"Range: {range_arr}")
print(f"Linspace: {linspace}")
# Random arrays
np.random.seed(42)
random_uniform = np.random.rand(3) # [0, 1) uniform
random_normal = np.random.randn(3) # Normal distribution
random_int = np.random.randint(1, 10, 5) # Random integers
print(f"\nRandom uniform: {random_uniform}")
print(f"Random normal: {random_normal}")
print(f"Random integers: {random_int}")
# --- Array Operations ---
print("\n--- Array Operations (Vectorized) ---")
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])
print(f"a = {a}")
print(f"b = {b}")
print(f"a + b = {a + b}")
print(f"a * b = {a * b}")
print(f"a ** 2 = {a ** 2}")
print(f"np.sqrt(a) = {np.sqrt(a)}")
# Aggregations
print(f"\nsum: {a.sum()}, mean: {a.mean():.2f}, std: {a.std():.2f}")
print(f"min: {a.min()}, max: {a.max()}")
print(f"cumsum: {np.cumsum(a)}")
# --- Indexing and Slicing ---
print("\n--- Indexing and Slicing ---")
arr = np.array([[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]])
print(f"Array:\n{arr}")
print(f"arr[0, 0] = {arr[0, 0]}")
print(f"arr[1, 2] = {arr[1, 2]}")
print(f"First row: {arr[0, :]}")
print(f"First column: {arr[:, 0]}")
print(f"Subarray [0:2, 1:3]:\n{arr[0:2, 1:3]}")
# --- Boolean Indexing ---
print("\n--- Boolean Indexing ---")
data = np.array([1, 5, 3, 8, 2, 9, 4, 7])
print(f"Data: {data}")
print(f"data > 5: {data > 5}")
print(f"data[data > 5] = {data[data > 5]}")
# --- Broadcasting ---
print("\n--- Broadcasting ---")
# Different shapes automatically align
matrix = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
row = np.array([10, 20, 30])
col = np.array([[100], [200], [300]])
print(f"Matrix:\n{matrix}")
print(f"Matrix + row:\n{matrix + row}")
print(f"Matrix + col:\n{matrix + col}")
# --- Reshaping ---
print("\n--- Reshaping ---")
arr = np.arange(12)
print(f"Original: {arr}")
reshaped = arr.reshape(3, 4)
print(f"Reshaped (3, 4):\n{reshaped}")
print(f"Transposed:\n{reshaped.T}")
print(f"Flattened: {reshaped.flatten()}")
# ============================================================
# PANDAS DATA MANIPULATION
# ============================================================
print("\n" + "=" * 60)
print("PANDAS DATA MANIPULATION")
print("=" * 60)
# Import pandas (handle if not installed)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False
print("Pandas not installed. Install with: pip install pandas")
if PANDAS_AVAILABLE:
# --- Series ---
print("\n--- Series ---")
s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])
print(f"Series:\n{s}")
print(f"s['c'] = {s['c']}")
print(f"s > 25:\n{s[s > 25]}")
# --- DataFrame Creation ---
print("\n--- DataFrame Creation ---")
# From dictionary
data = {
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
'age': [25, 30, 35, 28, 32],
'city': ['NYC', 'LA', 'Chicago', 'NYC', 'LA'],
'salary': [50000, 60000, 75000, 55000, 65000]
}
df = pd.DataFrame(data)
print(f"DataFrame:\n{df}")
print(f"\nInfo:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Dtypes:\n{df.dtypes}")
# --- Data Selection ---
print("\n--- Data Selection ---")
print(f"df['name']:\n{df['name']}")
print(f"\ndf[['name', 'salary']]:\n{df[['name', 'salary']]}")
print(f"\ndf.loc[0] (first row):\n{df.loc[0]}")
print(f"\ndf.iloc[0:2] (first 2 rows):\n{df.iloc[0:2]}")
print(f"\ndf.loc[1, 'salary'] = {df.loc[1, 'salary']}")
# --- Filtering ---
print("\n--- Filtering ---")
# Boolean conditions
high_salary = df[df['salary'] > 55000]
print(f"Salary > 55000:\n{high_salary}")
nyc_young = df[(df['city'] == 'NYC') & (df['age'] < 30)]
print(f"\nNYC and age < 30:\n{nyc_young}")
# --- Adding and Modifying Columns ---
print("\n--- Adding and Modifying Columns ---")
df['bonus'] = df['salary'] * 0.1
df['total_comp'] = df['salary'] + df['bonus']
df['age_group'] = df['age'].apply(lambda x: 'Young' if x < 30 else 'Senior')
print(f"With new columns:\n{df}")
# --- Grouping and Aggregation ---
print("\n--- Grouping and Aggregation ---")
# Group by single column
by_city = df.groupby('city')['salary'].agg(['mean', 'sum', 'count'])
print(f"Salary by city:\n{by_city}")
# Multiple aggregations
agg_result = df.groupby('city').agg({
'salary': ['mean', 'max'],
'age': 'mean'
})
print(f"\nMultiple aggregations:\n{agg_result}")
# --- Sorting ---
print("\n--- Sorting ---")
sorted_df = df.sort_values('salary', ascending=False)
print(f"Sorted by salary (desc):\n{sorted_df[['name', 'salary']]}")
# --- Missing Data ---
print("\n--- Missing Data ---")
df_missing = pd.DataFrame({
'A': [1, 2, np.nan, 4],
'B': [5, np.nan, np.nan, 8],
'C': [9, 10, 11, 12]
})
print(f"With missing data:\n{df_missing}")
print(f"\nMissing count:\n{df_missing.isna().sum()}")
# Fill missing values
filled = df_missing.fillna(df_missing.mean())
print(f"\nFilled with mean:\n{filled}")
# --- Merge Example ---
print("\n--- Merge Example ---")
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
print(f"df1:\n{df1}")
print(f"\ndf2:\n{df2}")
merged = pd.merge(df1, df2, on='key', how='outer')
print(f"\nOuter merge:\n{merged}")
# ============================================================
# MATPLOTLIB VISUALIZATION
# ============================================================
print("\n" + "=" * 60)
print("MATPLOTLIB VISUALIZATION")
print("=" * 60)
try:
import matplotlib
matplotlib.use('Agg') # Non-interactive backend for scripts
import matplotlib.pyplot as plt
MATPLOTLIB_AVAILABLE = True
except ImportError:
MATPLOTLIB_AVAILABLE = False
print("Matplotlib not installed. Install with: pip install matplotlib")
if MATPLOTLIB_AVAILABLE:
print("\n--- Creating Plots (saved to files) ---")
# Line Plot
x = np.linspace(0, 2 * np.pi, 100)
y_sin = np.sin(x)
y_cos = np.cos(x)
plt.figure(figsize=(10, 6))
plt.plot(x, y_sin, label='sin(x)', color='blue', linewidth=2)
plt.plot(x, y_cos, label='cos(x)', color='red', linewidth=2, linestyle='--')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Trigonometric Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('line_plot.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: line_plot.png")
# Scatter Plot
np.random.seed(42)
x = np.random.rand(50)
y = 2 * x + 0.5 + np.random.randn(50) * 0.2
plt.figure(figsize=(8, 6))
plt.scatter(x, y, c=x, cmap='viridis', s=100, alpha=0.7)
plt.colorbar(label='x value')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot with Color Mapping')
plt.savefig('scatter_plot.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: scatter_plot.png")
# Bar Plot
categories = ['A', 'B', 'C', 'D', 'E']
values = [23, 45, 56, 78, 32]
plt.figure(figsize=(8, 6))
bars = plt.bar(categories, values, color='steelblue', edgecolor='black')
plt.bar_label(bars)
plt.xlabel('Category')
plt.ylabel('Value')
plt.title('Bar Chart')
plt.savefig('bar_plot.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: bar_plot.png")
# Histogram
data = np.random.randn(1000)
plt.figure(figsize=(8, 6))
plt.hist(data, bins=30, color='green', alpha=0.7, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of Normal Distribution')
plt.axvline(data.mean(), color='red', linestyle='--', label=f'Mean: {data.mean():.2f}')
plt.legend()
plt.savefig('histogram.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: histogram.png")
# Subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Plot 1: Line
x = np.linspace(0, 10, 100)
axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Line Plot')
# Plot 2: Scatter
axes[0, 1].scatter(np.random.rand(30), np.random.rand(30))
axes[0, 1].set_title('Scatter Plot')
# Plot 3: Bar
axes[1, 0].bar(['A', 'B', 'C'], [10, 20, 15])
axes[1, 0].set_title('Bar Plot')
# Plot 4: Pie
axes[1, 1].pie([30, 20, 25, 25], labels=['A', 'B', 'C', 'D'], autopct='%1.1f%%')
axes[1, 1].set_title('Pie Chart')
plt.tight_layout()
plt.savefig('subplots.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: subplots.png")
# ============================================================
# SCIKIT-LEARN MACHINE LEARNING
# ============================================================
print("\n" + "=" * 60)
print("SCIKIT-LEARN MACHINE LEARNING")
print("=" * 60)
try:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
SKLEARN_AVAILABLE = True
except ImportError:
SKLEARN_AVAILABLE = False
print("Scikit-learn not installed. Install with: pip install scikit-learn")
if SKLEARN_AVAILABLE:
# --- Linear Regression ---
print("\n--- Linear Regression Example ---")
# Generate sample data
np.random.seed(42)
X = np.random.rand(100, 1) * 10 # Feature
y = 2.5 * X.flatten() + 5 + np.random.randn(100) * 2 # Target
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"True equation: y = 2.5x + 5 (+ noise)")
print(f"Learned: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")
# --- Classification Example ---
print("\n--- Classification Example (Iris-like) ---")
# Create sample classification data
np.random.seed(42)
n_samples = 150
# Three classes with different means
class0 = np.random.randn(n_samples // 3, 2) + [0, 0]
class1 = np.random.randn(n_samples // 3, 2) + [3, 3]
class2 = np.random.randn(n_samples // 3, 2) + [0, 6]
X = np.vstack([class0, class1, class2])
    y = np.repeat([0, 1, 2], n_samples // 3)  # labels for the stacked class0/1/2 samples
# Shuffle
indices = np.random.permutation(n_samples)
X, y = X[indices], y[indices]
# Split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
# Try different classifiers
classifiers = {
'Logistic Regression': LogisticRegression(max_iter=200),
'Decision Tree': DecisionTreeClassifier(max_depth=3, random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42)
}
for name, clf in classifiers.items():
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{name}: Accuracy = {accuracy:.2%}")
# Detailed report for best model
print("\nDetailed Classification Report (Random Forest):")
rf = classifiers['Random Forest']
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))
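    # The confusion matrix (rows = true class, columns = predicted class) is another
    # view of the same predictions:
    print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")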
# --- Preprocessing Pipeline ---
print("\n--- Preprocessing Pipeline ---")
# Create pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression(max_iter=200))
])
# Fit and predict
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"Pipeline accuracy: {accuracy_score(y_test, y_pred):.2%}")
# --- Feature Scaling Comparison ---
print("\n--- Feature Scaling Importance ---")
# Create data with different scales
X_unscaled = np.column_stack([
np.random.rand(100) * 1000, # Feature 1: 0-1000
np.random.rand(100) * 0.01 # Feature 2: 0-0.01
])
y_binary = (X_unscaled[:, 0] + X_unscaled[:, 1] * 50000 > 500).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
X_unscaled, y_binary, test_size=0.2, random_state=42
)
# Without scaling
lr_unscaled = LogisticRegression(max_iter=1000)
lr_unscaled.fit(X_train, y_train)
acc_unscaled = accuracy_score(y_test, lr_unscaled.predict(X_test))
# With scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
lr_scaled = LogisticRegression(max_iter=1000)
lr_scaled.fit(X_train_scaled, y_train)
acc_scaled = accuracy_score(y_test, lr_scaled.predict(X_test_scaled))
print(f"Without scaling: {acc_unscaled:.2%}")
print(f"With scaling: {acc_scaled:.2%}")
# ============================================================
# COMPLETE ML WORKFLOW EXAMPLE
# ============================================================
if SKLEARN_AVAILABLE and PANDAS_AVAILABLE:
print("\n" + "=" * 60)
print("COMPLETE ML WORKFLOW")
print("=" * 60)
# 1. Create synthetic dataset
print("\n1. Creating Dataset...")
np.random.seed(42)
n = 200
data = {
'age': np.random.randint(18, 65, n),
'income': np.random.randint(20000, 150000, n),
'education_years': np.random.randint(8, 20, n),
'hours_per_week': np.random.randint(20, 60, n),
}
df = pd.DataFrame(data)
# Target: high earner based on income
df['high_earner'] = (df['income'] > 70000).astype(int)
print(f"Dataset shape: {df.shape}")
print(f"Target distribution:\n{df['high_earner'].value_counts()}")
# 2. Prepare features and target
print("\n2. Preparing Features...")
X = df.drop('high_earner', axis=1)
y = df['high_earner']
# 3. Split data
print("\n3. Splitting Data...")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
# 4. Create and train pipeline
print("\n4. Training Model...")
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', RandomForestClassifier(
n_estimators=50,
max_depth=5,
random_state=42
))
])
pipeline.fit(X_train, y_train)
# 5. Evaluate
print("\n5. Evaluating Model...")
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))
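    # The probabilities computed above also feed threshold-independent metrics;
    # a minimal sketch using ROC AUC:
    from sklearn.metrics import roc_auc_score
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")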
# 6. Feature Importance
print("\n6. Feature Importance:")
rf = pipeline.named_steps['classifier']
importance = pd.DataFrame({
'feature': X.columns,
'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print(importance)
    # 7. Make predictions on new data
    print("\n7. Making Predictions on New Data...")
    # The pipeline was trained on all four feature columns (age, income,
    # education_years, hours_per_week), so new samples must supply the same columns.
    new_data = pd.DataFrame({
        'age': [35, 50],
        'income': [80000, 45000],
        'education_years': [16, 12],
        'hours_per_week': [45, 35]
    })
    new_pred = pipeline.predict(new_data)
    new_proba = pipeline.predict_proba(new_data)[:, 1]
    for i, (pred, proba) in enumerate(zip(new_pred, new_proba)):
        print(f"Sample {i}: high_earner={pred} (probability {proba:.2f})")
    # Caveat: the target was derived directly from income, so the model largely
    # re-learns the 70000 threshold; a real project would use an independent target.
print("\n" + "=" * 60)
print("All examples completed!")
print("=" * 60)
print("\nInstall packages for full functionality:")
print(" pip install numpy pandas matplotlib scikit-learn")