"""
Python Data Science and Machine Learning - Examples
Comprehensive demonstrations using NumPy, Pandas, Matplotlib, and Scikit-learn
"""
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# ============================================================
# NUMPY FUNDAMENTALS
# ============================================================
print("=" * 60)
print("NUMPY FUNDAMENTALS")
print("=" * 60)
# --- Creating Arrays ---
print("\n--- Creating Arrays ---")
# From Python lists
arr1d = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"1D Array: {arr1d}")
print(f"2D Array:\n{arr2d}")
print(f"Shape: {arr2d.shape}, Dtype: {arr2d.dtype}")
# Special arrays
zeros = np.zeros((2, 3))
ones = np.ones((2, 3))
identity = np.eye(3)
range_arr = np.arange(0, 10, 2)
linspace = np.linspace(0, 1, 5)
print(f"\nZeros:\n{zeros}")
print(f"Identity:\n{identity}")
print(f"Range: {range_arr}")
print(f"Linspace: {linspace}")
# Random arrays
np.random.seed(42)
random_uniform = np.random.rand(3) # [0, 1) uniform
random_normal = np.random.randn(3) # Normal distribution
random_int = np.random.randint(1, 10, 5) # Random integers
print(f"\nRandom uniform: {random_uniform}")
print(f"Random normal: {random_normal}")
print(f"Random integers: {random_int}")
# --- Array Operations ---
print("\n--- Array Operations (Vectorized) ---")
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])
print(f"a = {a}")
print(f"b = {b}")
print(f"a + b = {a + b}")
print(f"a * b = {a * b}")
print(f"a ** 2 = {a ** 2}")
print(f"np.sqrt(a) = {np.sqrt(a)}")
# Aggregations
print(f"\nsum: {a.sum()}, mean: {a.mean():.2f}, std: {a.std():.2f}")
print(f"min: {a.min()}, max: {a.max()}")
print(f"cumsum: {np.cumsum(a)}")
# --- Indexing and Slicing ---
print("\n--- Indexing and Slicing ---")
arr = np.array([[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]])
print(f"Array:\n{arr}")
print(f"arr[0, 0] = {arr[0, 0]}")
print(f"arr[1, 2] = {arr[1, 2]}")
print(f"First row: {arr[0, :]}")
print(f"First column: {arr[:, 0]}")
print(f"Subarray [0:2, 1:3]:\n{arr[0:2, 1:3]}")
# --- Boolean Indexing ---
print("\n--- Boolean Indexing ---")
data = np.array([1, 5, 3, 8, 2, 9, 4, 7])
print(f"Data: {data}")
print(f"data > 5: {data > 5}")
print(f"data[data > 5] = {data[data > 5]}")
# --- Broadcasting ---
print("\n--- Broadcasting ---")
# Different shapes automatically align
matrix = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
row = np.array([10, 20, 30])
col = np.array([[100], [200], [300]])
print(f"Matrix:\n{matrix}")
print(f"Matrix + row:\n{matrix + row}")
print(f"Matrix + col:\n{matrix + col}")
# --- Reshaping ---
print("\n--- Reshaping ---")
arr = np.arange(12)
print(f"Original: {arr}")
reshaped = arr.reshape(3, 4)
print(f"Reshaped (3, 4):\n{reshaped}")
print(f"Transposed:\n{reshaped.T}")
print(f"Flattened: {reshaped.flatten()}")
# ============================================================
# PANDAS DATA MANIPULATION
# ============================================================
print("\n" + "=" * 60)
print("PANDAS DATA MANIPULATION")
print("=" * 60)
# Import pandas (handle if not installed)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False
print("Pandas not installed. Install with: pip install pandas")
if PANDAS_AVAILABLE:
# --- Series ---
print("\n--- Series ---")
s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])
print(f"Series:\n{s}")
print(f"s['c'] = {s['c']}")
print(f"s > 25:\n{s[s > 25]}")
# --- DataFrame Creation ---
print("\n--- DataFrame Creation ---")
# From dictionary
data = {
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
'age': [25, 30, 35, 28, 32],
'city': ['NYC', 'LA', 'Chicago', 'NYC', 'LA'],
'salary': [50000, 60000, 75000, 55000, 65000]
}
df = pd.DataFrame(data)
print(f"DataFrame:\n{df}")
print(f"\nInfo:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Dtypes:\n{df.dtypes}")
# --- Data Selection ---
print("\n--- Data Selection ---")
print(f"df['name']:\n{df['name']}")
print(f"\ndf[['name', 'salary']]:\n{df[['name', 'salary']]}")
print(f"\ndf.loc[0] (first row):\n{df.loc[0]}")
print(f"\ndf.iloc[0:2] (first 2 rows):\n{df.iloc[0:2]}")
print(f"\ndf.loc[1, 'salary'] = {df.loc[1, 'salary']}")
# --- Filtering ---
print("\n--- Filtering ---")
# Boolean conditions
high_salary = df[df['salary'] > 55000]
print(f"Salary > 55000:\n{high_salary}")
nyc_young = df[(df['city'] == 'NYC') & (df['age'] < 30)]
print(f"\nNYC and age < 30:\n{nyc_young}")
# --- Adding and Modifying Columns ---
print("\n--- Adding and Modifying Columns ---")
df['bonus'] = df['salary'] * 0.1
df['total_comp'] = df['salary'] + df['bonus']
df['age_group'] = df['age'].apply(lambda x: 'Young' if x < 30 else 'Senior')
print(f"With new columns:\n{df}")
# --- Grouping and Aggregation ---
print("\n--- Grouping and Aggregation ---")
# Group by single column
by_city = df.groupby('city')['salary'].agg(['mean', 'sum', 'count'])
print(f"Salary by city:\n{by_city}")
# Multiple aggregations
agg_result = df.groupby('city').agg({
'salary': ['mean', 'max'],
'age': 'mean'
})
print(f"\nMultiple aggregations:\n{agg_result}")
# --- Sorting ---
print("\n--- Sorting ---")
sorted_df = df.sort_values('salary', ascending=False)
print(f"Sorted by salary (desc):\n{sorted_df[['name', 'salary']]}")
# --- Missing Data ---
print("\n--- Missing Data ---")
df_missing = pd.DataFrame({
'A': [1, 2, np.nan, 4],
'B': [5, np.nan, np.nan, 8],
'C': [9, 10, 11, 12]
})
print(f"With missing data:\n{df_missing}")
print(f"\nMissing count:\n{df_missing.isna().sum()}")
# Fill missing values
filled = df_missing.fillna(df_missing.mean())
print(f"\nFilled with mean:\n{filled}")
# --- Merge Example ---
print("\n--- Merge Example ---")
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
print(f"df1:\n{df1}")
print(f"\ndf2:\n{df2}")
merged = pd.merge(df1, df2, on='key', how='outer')
print(f"\nOuter merge:\n{merged}")
# ============================================================
# MATPLOTLIB VISUALIZATION
# ============================================================
print("\n" + "=" * 60)
print("MATPLOTLIB VISUALIZATION")
print("=" * 60)
try:
import matplotlib
matplotlib.use('Agg') # Non-interactive backend for scripts
import matplotlib.pyplot as plt
MATPLOTLIB_AVAILABLE = True
except ImportError:
MATPLOTLIB_AVAILABLE = False
print("Matplotlib not installed. Install with: pip install matplotlib")
if MATPLOTLIB_AVAILABLE:
print("\n--- Creating Plots (saved to files) ---")
# Line Plot
x = np.linspace(0, 2 * np.pi, 100)
y_sin = np.sin(x)
y_cos = np.cos(x)
plt.figure(figsize=(10, 6))
plt.plot(x, y_sin, label='sin(x)', color='blue', linewidth=2)
plt.plot(x, y_cos, label='cos(x)', color='red', linewidth=2, linestyle='--')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Trigonometric Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('line_plot.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: line_plot.png")
# Scatter Plot
np.random.seed(42)
x = np.random.rand(50)
y = 2 * x + 0.5 + np.random.randn(50) * 0.2
plt.figure(figsize=(8, 6))
plt.scatter(x, y, c=x, cmap='viridis', s=100, alpha=0.7)
plt.colorbar(label='x value')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot with Color Mapping')
plt.savefig('scatter_plot.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: scatter_plot.png")
# Bar Plot
categories = ['A', 'B', 'C', 'D', 'E']
values = [23, 45, 56, 78, 32]
plt.figure(figsize=(8, 6))
bars = plt.bar(categories, values, color='steelblue', edgecolor='black')
plt.bar_label(bars)
plt.xlabel('Category')
plt.ylabel('Value')
plt.title('Bar Chart')
plt.savefig('bar_plot.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: bar_plot.png")
# Histogram
data = np.random.randn(1000)
plt.figure(figsize=(8, 6))
plt.hist(data, bins=30, color='green', alpha=0.7, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram of Normal Distribution')
plt.axvline(data.mean(), color='red', linestyle='--', label=f'Mean: {data.mean():.2f}')
plt.legend()
plt.savefig('histogram.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: histogram.png")
# Subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Plot 1: Line
x = np.linspace(0, 10, 100)
axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Line Plot')
# Plot 2: Scatter
axes[0, 1].scatter(np.random.rand(30), np.random.rand(30))
axes[0, 1].set_title('Scatter Plot')
# Plot 3: Bar
axes[1, 0].bar(['A', 'B', 'C'], [10, 20, 15])
axes[1, 0].set_title('Bar Plot')
# Plot 4: Pie
axes[1, 1].pie([30, 20, 25, 25], labels=['A', 'B', 'C', 'D'], autopct='%1.1f%%')
axes[1, 1].set_title('Pie Chart')
plt.tight_layout()
plt.savefig('subplots.png', dpi=100, bbox_inches='tight')
plt.close()
print("Created: subplots.png")
# ============================================================
# SCIKIT-LEARN MACHINE LEARNING
# ============================================================
print("\n" + "=" * 60)
print("SCIKIT-LEARN MACHINE LEARNING")
print("=" * 60)
try:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
SKLEARN_AVAILABLE = True
except ImportError:
SKLEARN_AVAILABLE = False
print("Scikit-learn not installed. Install with: pip install scikit-learn")
if SKLEARN_AVAILABLE:
# --- Linear Regression ---
print("\n--- Linear Regression Example ---")
# Generate sample data
np.random.seed(42)
X = np.random.rand(100, 1) * 10 # Feature
y = 2.5 * X.flatten() + 5 + np.random.randn(100) * 2 # Target
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Train model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"True equation: y = 2.5x + 5 (+ noise)")
print(f"Learned: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}")
print(f"MSE: {mse:.4f}")
print(f"R² Score: {r2:.4f}")
# --- Classification Example ---
print("\n--- Classification Example (Iris-like) ---")
# Create sample classification data
np.random.seed(42)
n_samples = 150
# Three classes with different means
class0 = np.random.randn(n_samples // 3, 2) + [0, 0]
class1 = np.random.randn(n_samples // 3, 2) + [3, 3]
class2 = np.random.randn(n_samples // 3, 2) + [0, 6]
X = np.vstack([class0, class1, class2])
    y = np.repeat([0, 1, 2], n_samples // 3)  # labels for the stacked class0/1/2 samples
# Shuffle
indices = np.random.permutation(n_samples)
X, y = X[indices], y[indices]
# Split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
# Try different classifiers
classifiers = {
'Logistic Regression': LogisticRegression(max_iter=200),
'Decision Tree': DecisionTreeClassifier(max_depth=3, random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42)
}
for name, clf in classifiers.items():
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"{name}: Accuracy = {accuracy:.2%}")
# Detailed report for best model
print("\nDetailed Classification Report (Random Forest):")
rf = classifiers['Random Forest']
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1', 'Class 2']))
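    # The confusion matrix (rows = true class, columns = predicted class) is another
    # view of the same predictions:
    print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")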
# --- Preprocessing Pipeline ---
print("\n--- Preprocessing Pipeline ---")
# Create pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression(max_iter=200))
])
# Fit and predict
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"Pipeline accuracy: {accuracy_score(y_test, y_pred):.2%}")
# --- Feature Scaling Comparison ---
print("\n--- Feature Scaling Importance ---")
# Create data with different scales
X_unscaled = np.column_stack([
np.random.rand(100) * 1000, # Feature 1: 0-1000
np.random.rand(100) * 0.01 # Feature 2: 0-0.01
])
y_binary = (X_unscaled[:, 0] + X_unscaled[:, 1] * 50000 > 500).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
X_unscaled, y_binary, test_size=0.2, random_state=42
)
# Without scaling
lr_unscaled = LogisticRegression(max_iter=1000)
lr_unscaled.fit(X_train, y_train)
acc_unscaled = accuracy_score(y_test, lr_unscaled.predict(X_test))
# With scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
lr_scaled = LogisticRegression(max_iter=1000)
lr_scaled.fit(X_train_scaled, y_train)
acc_scaled = accuracy_score(y_test, lr_scaled.predict(X_test_scaled))
print(f"Without scaling: {acc_unscaled:.2%}")
print(f"With scaling: {acc_scaled:.2%}")
# ============================================================
# COMPLETE ML WORKFLOW EXAMPLE
# ============================================================
if SKLEARN_AVAILABLE and PANDAS_AVAILABLE:
print("\n" + "=" * 60)
print("COMPLETE ML WORKFLOW")
print("=" * 60)
# 1. Create synthetic dataset
print("\n1. Creating Dataset...")
np.random.seed(42)
n = 200
data = {
'age': np.random.randint(18, 65, n),
'income': np.random.randint(20000, 150000, n),
'education_years': np.random.randint(8, 20, n),
'hours_per_week': np.random.randint(20, 60, n),
}
df = pd.DataFrame(data)
# Target: high earner based on income
df['high_earner'] = (df['income'] > 70000).astype(int)
print(f"Dataset shape: {df.shape}")
print(f"Target distribution:\n{df['high_earner'].value_counts()}")
# 2. Prepare features and target
print("\n2. Preparing Features...")
X = df.drop('high_earner', axis=1)
y = df['high_earner']
# 3. Split data
print("\n3. Splitting Data...")
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
# 4. Create and train pipeline
print("\n4. Training Model...")
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', RandomForestClassifier(
n_estimators=50,
max_depth=5,
random_state=42
))
])
pipeline.fit(X_train, y_train)
# 5. Evaluate
print("\n5. Evaluating Model...")
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))
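    # The probabilities computed above also feed threshold-independent metrics;
    # a minimal sketch using ROC AUC:
    from sklearn.metrics import roc_auc_score
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")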
# 6. Feature Importance
print("\n6. Feature Importance:")
rf = pipeline.named_steps['classifier']
importance = pd.DataFrame({
'feature': X.columns,
'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print(importance)
    # 7. Make predictions on new data
    print("\n7. Making Predictions on New Data...")
    # The pipeline was trained on all four feature columns (age, income,
    # education_years, hours_per_week), so new samples must supply the same columns.
    new_data = pd.DataFrame({
        'age': [35, 50],
        'income': [80000, 45000],
        'education_years': [16, 12],
        'hours_per_week': [45, 35]
    })
    new_pred = pipeline.predict(new_data)
    new_proba = pipeline.predict_proba(new_data)[:, 1]
    for i, (pred, proba) in enumerate(zip(new_pred, new_proba)):
        print(f"Sample {i}: high_earner={pred} (probability {proba:.2f})")
    # Caveat: the target was derived directly from income, so the model largely
    # re-learns the 70000 threshold; a real project would use an independent target.
print("\n" + "=" * 60)
print("All examples completed!")
print("=" * 60)
print("\nInstall packages for full functionality:")
print(" pip install numpy pandas matplotlib scikit-learn")