Data Science and Machine Learning Fundamentals with Python

This module introduces Python's data science ecosystem and basic machine learning concepts.

Table of Contents

  1. NumPy Fundamentals
  2. Pandas for Data Manipulation
  3. Data Visualization with Matplotlib
  4. Introduction to Machine Learning
  5. Scikit-learn Basics
  6. Best Practices

NumPy Fundamentals

NumPy is the foundation of Python's scientific computing stack.

Why NumPy?

# Standard Python list operations are slow
python_list = list(range(1000000))
# Each element-wise operation loops in the Python interpreter

# NumPy arrays are fast (vectorized operations)
import numpy as np
numpy_array = np.arange(1000000)
# The same loop runs in compiled C code, typically orders of magnitude faster
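
A rough timing sketch of the difference (the exact speedup depends on your machine and NumPy build):

import timeit
import numpy as np

python_list = list(range(1000000))
numpy_array = np.arange(1000000)

# Square every element: list comprehension vs. vectorized NumPy
list_time = timeit.timeit(lambda: [x * x for x in python_list], number=10)
numpy_time = timeit.timeit(lambda: numpy_array * numpy_array, number=10)

print(f"List comprehension: {list_time:.3f}s")
print(f"NumPy vectorized:   {numpy_time:.3f}s")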

Creating Arrays

import numpy as np

# From Python lists
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([[1, 2, 3], [4, 5, 6]])  # 2D array

# Using built-in functions
zeros = np.zeros((3, 4))        # 3x4 array of zeros
ones = np.ones((2, 3))          # 2x3 array of ones
identity = np.eye(4)            # 4x4 identity matrix
range_arr = np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5) # [0, 0.25, 0.5, 0.75, 1]
random = np.random.rand(3, 3)   # Random 3x3 array

# Data types
int_arr = np.array([1, 2, 3], dtype=np.int32)
float_arr = np.array([1, 2, 3], dtype=np.float64)

Array Operations

import numpy as np

a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])

# Vectorized arithmetic (element-wise)
print(a + b)      # [11, 22, 33, 44, 55]
print(a * b)      # [10, 40, 90, 160, 250]
print(a ** 2)     # [1, 4, 9, 16, 25]
print(np.sqrt(a)) # [1.0, 1.41, 1.73, 2.0, 2.24]

# Aggregations
print(a.sum())    # 15
print(a.mean())   # 3.0
print(a.std())    # 1.41...
print(a.min())    # 1
print(a.max())    # 5

# Boolean operations
print(a > 2)      # [False, False, True, True, True]
print(a[a > 2])   # [3, 4, 5] - boolean indexing

Array Indexing and Slicing

import numpy as np

arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])

# Basic indexing
print(arr[0, 0])      # 1 (row 0, col 0)
print(arr[1, 2])      # 7 (row 1, col 2)

# Slicing
print(arr[0, :])      # [1, 2, 3, 4] - first row
print(arr[:, 0])      # [1, 5, 9] - first column
print(arr[0:2, 1:3])  # [[2, 3], [6, 7]] - subarray

# Fancy indexing
print(arr[[0, 2], :]) # Rows 0 and 2

Reshaping and Broadcasting

import numpy as np

# Reshaping
arr = np.arange(12)
reshaped = arr.reshape(3, 4)  # 3 rows, 4 columns
flattened = reshaped.flatten() # Back to 1D

# Broadcasting - automatic shape matching
a = np.array([[1], [2], [3]])  # Shape (3, 1)
b = np.array([10, 20, 30])     # Shape (3,)
print(a + b)
# [[11, 21, 31],
#  [12, 22, 32],
#  [13, 23, 33]]
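
Broadcasting is handy for tasks like centering every column of a matrix; a small sketch:

import numpy as np

data = np.arange(12).reshape(3, 4).astype(float)
col_means = data.mean(axis=0)   # Shape (4,)
centered = data - col_means     # (3, 4) minus (4,) broadcasts across rows
print(centered.mean(axis=0))    # [0. 0. 0. 0.]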

Pandas for Data Manipulation

Pandas provides powerful data structures for data analysis.

Series and DataFrames

import pandas as pd

# Series - 1D labeled array
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(s['a'])  # 1
print(s[s > 2])  # c, d, e

# DataFrame - 2D labeled data structure
data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['NYC', 'LA', 'Chicago']
}
df = pd.DataFrame(data)
print(df)
#       name  age     city
# 0    Alice   25      NYC
# 1      Bob   30       LA
# 2  Charlie   35  Chicago

Reading and Writing Data

import pandas as pd

# Reading data
df_csv = pd.read_csv('data.csv')
df_excel = pd.read_excel('data.xlsx')
df_json = pd.read_json('data.json')
df_sql = pd.read_sql('SELECT * FROM table', connection)  # Requires an open DB connection object

# Writing data
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx', index=False)
df.to_json('output.json')

# Common options
df = pd.read_csv('data.csv',
                 sep=',',           # Delimiter
                 header=0,          # Row to use as header
                 index_col='id',    # Column to use as index
                 usecols=['a','b'], # Columns to read
                 dtype={'a': int},  # Data types
                 parse_dates=['date']) # Parse date columns

Data Selection

import pandas as pd

df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}, index=['x', 'y', 'z'])

# Column selection
print(df['A'])        # Series
print(df[['A', 'B']]) # DataFrame

# Row selection
print(df.loc['x'])         # By label
print(df.iloc[0])          # By position
print(df.loc['x':'y'])     # Slice by label
print(df.iloc[0:2])        # Slice by position

# Cell selection
print(df.loc['x', 'A'])    # 1
print(df.iloc[0, 0])       # 1

# Boolean selection
print(df[df['A'] > 1])     # Rows where A > 1
print(df.query('A > 1'))   # Same with query syntax

Data Manipulation

import pandas as pd

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Alice'],
    'category': ['A', 'B', 'A', 'B'],
    'value': [100, 200, 150, 175]
})

# Adding columns
df['doubled'] = df['value'] * 2
df['rank'] = df['value'].rank()

# Filtering
filtered = df[df['value'] > 150]

# Sorting
sorted_df = df.sort_values('value', ascending=False)

# Grouping
grouped = df.groupby('category')['value'].mean()
# category
# A    125.0
# B    187.5

# Multiple aggregations
agg = df.groupby('category').agg({
    'value': ['sum', 'mean', 'count'],
    'name': 'nunique'
})

# Pivot tables
pivot = df.pivot_table(
    values='value',
    index='name',
    columns='category',
    aggfunc='sum'
)
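# Resulting pivot (sum of value per name/category):
# category      A      B
# name
# Alice     100.0  175.0
# Bob         NaN  200.0
# Charlie   150.0    NaN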

Handling Missing Data

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
})

# Detecting missing data
print(df.isna())          # Boolean mask
print(df.isna().sum())    # Count per column

# Dropping missing data
df.dropna()               # Drop rows with any NaN
df.dropna(how='all')      # Drop rows where all NaN
df.dropna(subset=['A'])   # Drop if A is NaN

# Filling missing data (these return new objects rather than modifying df)
df.fillna(0)              # Fill with a constant value
df.ffill()                # Forward fill (fillna(method='ffill') is deprecated)
df.bfill()                # Backward fill
df['A'].fillna(df['A'].mean())  # Fill with the column mean

Merging and Joining

import pandas as pd

# Two DataFrames
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Merge (like SQL JOIN)
inner = pd.merge(df1, df2, on='key', how='inner')  # Only matching
left = pd.merge(df1, df2, on='key', how='left')    # All from left
right = pd.merge(df1, df2, on='key', how='right')  # All from right
outer = pd.merge(df1, df2, on='key', how='outer')  # All from both
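# inner result - only keys present in both frames:
#   key  value1  value2
# 0   A       1       4
# 1   B       2       5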

# Concatenation
combined = pd.concat([df1, df2], ignore_index=True)  # Stack vertically; unshared columns become NaN
combined = pd.concat([df1, df2], axis=1)             # Stack horizontally (side by side)

Data Visualization with Matplotlib

Matplotlib is Python's fundamental plotting library.

Basic Plotting

import matplotlib.pyplot as plt
import numpy as np

# Line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(10, 6))
plt.plot(x, y, label='sin(x)', color='blue', linewidth=2)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Sine Wave')
plt.legend()
plt.grid(True)
plt.savefig('sine_wave.png', dpi=150)
plt.show()

Multiple Plot Types

import matplotlib.pyplot as plt
import numpy as np

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Line plot
x = np.linspace(0, 10, 100)
axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Line Plot')

# Scatter plot
x = np.random.rand(50)
y = np.random.rand(50)
colors = np.random.rand(50)
axes[0, 1].scatter(x, y, c=colors, cmap='viridis')
axes[0, 1].set_title('Scatter Plot')

# Bar plot
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]
axes[1, 0].bar(categories, values, color='steelblue')
axes[1, 0].set_title('Bar Plot')

# Histogram
data = np.random.randn(1000)
axes[1, 1].hist(data, bins=30, color='green', alpha=0.7)
axes[1, 1].set_title('Histogram')

plt.tight_layout()
plt.show()

Pandas Plotting Integration

import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'month': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
    'sales': [100, 120, 150, 130, 170],
    'expenses': [80, 90, 100, 110, 120]
})

# Pandas has built-in plotting
df.plot(x='month', y=['sales', 'expenses'], kind='bar')
plt.title('Monthly Sales vs Expenses')
plt.show()

# Other plot types
df['sales'].plot(kind='line')
df['sales'].plot(kind='hist')
df.plot(kind='scatter', x='sales', y='expenses')

Introduction to Machine Learning

Machine learning enables computers to learn patterns from data.

ML Categories

  1. Supervised Learning: Learn from labeled data

    • Classification: Predict categories (spam/not spam)
    • Regression: Predict continuous values (house prices)
  2. Unsupervised Learning: Find patterns in unlabeled data

    • Clustering: Group similar items
    • Dimensionality Reduction: Simplify data
  3. Reinforcement Learning: Learn through rewards/penalties
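
A minimal sketch contrasting the first two categories on toy data (the numbers here are illustrative only):

from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
import numpy as np

X = np.array([[1, 1], [1, 2], [8, 8], [9, 8]])

# Supervised: labels are provided and the model learns to predict them
y = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X, y)
print(clf.predict([[2, 1]]))  # Expected to predict class 0

# Unsupervised: no labels; the model groups similar points on its own
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(km.labels_)             # Two clusters, e.g. [0 0 1 1] or [1 1 0 0]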

The ML Workflow

1. Collect Data
2. Explore & Visualize
3. Prepare Data (clean, transform)
4. Split Data (train/test)
5. Select Model
6. Train Model
7. Evaluate Model
8. Tune Hyperparameters
9. Deploy Model

Scikit-learn Basics

Scikit-learn is Python's premier machine learning library.

Train-Test Split

from sklearn.model_selection import train_test_split

# X = features, y = target
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [0, 0, 1, 1, 1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 20% for testing
    random_state=42     # Reproducibility
)

Classification Example

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Regression Example

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Create synthetic data (the classic load_boston dataset has been removed from scikit-learn)
X = np.random.rand(100, 3)  # 100 samples, 3 features
y = 3*X[:, 0] + 2*X[:, 1] + X[:, 2] + np.random.randn(100)*0.1

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"Coefficients: {model.coef_}")

Feature Preprocessing

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

# Numerical scaling
X = np.array([[1, 100], [2, 200], [3, 300]])

# Standardization (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalization (0-1 range)
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)

# Categorical encoding
categories = ['cat', 'dog', 'cat', 'bird']

# Label encoding (for ordinal)
le = LabelEncoder()
encoded = le.fit_transform(categories)  # [1, 2, 1, 0]

# One-hot encoding (for nominal)
ohe = OneHotEncoder(sparse_output=False)  # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
one_hot = ohe.fit_transform(np.array(categories).reshape(-1, 1))
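# Columns follow the sorted categories: bird, cat, dog
# [[0., 1., 0.],
#  [0., 0., 1.],
#  [0., 1., 0.],
#  [1., 0., 0.]]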

Pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Use like a single model (X_train/X_test/y_train/y_test from the classification example above)
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"CV Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

Model Selection

from sklearn.model_selection import GridSearchCV

# Define parameter grid (keys are prefixed with the pipeline step name)
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__max_iter': [100, 200]
}

# Grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.3f}")
best_model = grid_search.best_estimator_

Best Practices

Data Science Workflow Tips

# 1. Always explore your data first
df.head()
df.info()
df.describe()
df.isnull().sum()

# 2. Handle missing values appropriately
# - Don't just drop without understanding why
# - Consider imputation strategies

# 3. Visualize distributions
df['column'].hist()
df.boxplot()

# 4. Check for correlations between numeric columns
df.corr(numeric_only=True)

# 5. Split data BEFORE preprocessing to prevent data leakage
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = StandardScaler()
scaler.fit(X_train)  # Fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Common Pitfalls to Avoid

# BAD: Fitting scaler on all data (data leakage)
scaler.fit(X)  # Includes test data!

# GOOD: Fit only on training data
scaler.fit(X_train)

# BAD: Not checking for class imbalance
# Model may always predict majority class

# GOOD: Check the class distribution first
print(y.value_counts())  # Assuming y is a pandas Series

# BAD: Using accuracy for imbalanced data
# 95% accuracy with 95% majority class is useless

# GOOD: Use precision, recall, F1, ROC-AUC
from sklearn.metrics import f1_score, roc_auc_score

Key Libraries Summary

Library        Purpose
NumPy          Numerical computing, arrays
Pandas         Data manipulation, analysis
Matplotlib     Basic plotting
Seaborn        Statistical visualization
Scikit-learn   Machine learning
Jupyter        Interactive notebooks

Next Steps

After mastering these fundamentals:

  1. Deep Learning: TensorFlow, PyTorch
  2. Advanced Visualization: Plotly, Bokeh
  3. Big Data: Dask, PySpark
  4. Natural Language Processing: NLTK, spaCy
  5. Computer Vision: OpenCV, PIL