Machine Learning Fundamentals: From Theory to Practice

Machine learning has revolutionized how we approach problem-solving in technology. From recommendation systems to autonomous vehicles, ML algorithms are powering innovations across industries. This guide will take you from the basics to implementing your first machine learning models.

Understanding Machine Learning

Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed. The core idea is to build algorithms that can identify patterns in data and make predictions or decisions.

Types of Machine Learning

Machine learning methods are usually grouped into three paradigms: supervised learning, where models learn from labeled examples (the classification and regression sections below); unsupervised learning, where models find structure in unlabeled data (the clustering section below); and reinforcement learning, where an agent learns from reward signals through trial and error. This guide focuses on the first two.

Essential Mathematics for ML

Before diving into algorithms, understanding the mathematical foundations is crucial.

Linear Algebra Basics

import numpy as np

# Vector operations
vector_a = np.array([1, 2, 3])
vector_b = np.array([4, 5, 6])

# Dot product
dot_product = np.dot(vector_a, vector_b)
print(f"Dot product: {dot_product}")

# Matrix operations
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])

# Matrix multiplication
product = np.matmul(matrix_a, matrix_b)
print(f"Matrix product:\n{product}")

# Eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(matrix_a)
print(f"Eigenvalues: {eigenvalues}")

Probability and Statistics

import scipy.stats as stats
import matplotlib.pyplot as plt

# Generate normal distribution data
data = np.random.normal(0, 1, 1000)

# Calculate basic statistics
mean = np.mean(data)
std = np.std(data)
print(f"Mean: {mean:.4f}, Standard Deviation: {std:.4f}")

# Hypothesis testing
t_stat, p_value = stats.ttest_1samp(data, 0)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

# Plot histogram
plt.hist(data, bins=30, alpha=0.7, edgecolor='black')
plt.axvline(mean, color='red', linestyle='--', label=f'Mean: {mean:.3f}')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Normal Distribution')
plt.legend()
plt.show()

Supervised Learning: Classification

Classification is the task of predicting discrete categories or classes.

K-Nearest Neighbors (KNN)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Generate synthetic classification data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, 
                          n_clusters_per_class=1, random_state=42)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Find the best k value using test-set accuracy (quick but optimistic; a cross-validated variant follows this block)
k_values = range(1, 21)
accuracies = []

for k in k_values:
    knn_temp = KNeighborsClassifier(n_neighbors=k)
    knn_temp.fit(X_train, y_train)
    y_pred_temp = knn_temp.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred_temp))

optimal_k = k_values[np.argmax(accuracies)]
print(f"\nOptimal k value: {optimal_k}")

Support Vector Machines (SVM)

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Use a 2D dataset for visualization
X_2d, y_2d = make_classification(n_samples=200, n_features=2, n_classes=2, 
                                 n_clusters_per_class=1, random_state=42)

# Scale the features
scaler = StandardScaler()
X_2d_scaled = scaler.fit_transform(X_2d)

# Split data
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(
    X_2d_scaled, y_2d, test_size=0.2, random_state=42)

# Train SVM with different kernels
kernels = ['linear', 'rbf', 'poly']
svm_models = {}

for kernel in kernels:
    svm = SVC(kernel=kernel, random_state=42)
    svm.fit(X_train_2d, y_train_2d)
    svm_models[kernel] = svm

# Visualize decision boundaries
def plot_decision_boundary(model, X, y, title):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', s=20)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')

plt.figure(figsize=(15, 5))
for i, (kernel, model) in enumerate(svm_models.items()):
    plt.subplot(1, 3, i+1)
    plot_decision_boundary(model, X_2d_scaled, y_2d, f'SVM with {kernel} kernel')
plt.tight_layout()
plt.show()
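The RBF kernel's behaviour depends strongly on C and gamma. Below is a hedged sketch of tuning both with GridSearchCV on the 2D training split; the parameter ranges are illustrative, not recommendations:

from sklearn.model_selection import GridSearchCV

# Grid-search C and gamma for the RBF kernel (cv=5 refits the best model on all training data)
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 'scale']}
grid = GridSearchCV(SVC(kernel='rbf', random_state=42), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_2d, y_train_2d)

print(f"Best parameters: {grid.best_params_}")
print(f"Best CV accuracy: {grid.best_score_:.4f}")
print(f"Test accuracy: {grid.score(X_test_2d, y_test_2d):.4f}")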

Supervised Learning: Regression

Regression predicts continuous numerical values rather than discrete classes.

Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Generate synthetic regression data
np.random.seed(42)
X_reg = np.random.rand(100, 1) * 10
y_reg = 2 * X_reg + 1 + np.random.randn(100, 1) * 0.5

# Split data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42)

# Train linear regression model
lr = LinearRegression()
lr.fit(X_train_reg, y_train_reg)

# Make predictions
y_pred_reg = lr.predict(X_test_reg)

# Evaluate model
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"Linear Regression Results:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")
print(f"Slope: {lr.coef_[0][0]:.4f}")
print(f"Intercept: {lr.intercept_[0]:.4f}")

# Visualize results
plt.figure(figsize=(10, 6))
plt.scatter(X_test_reg, y_test_reg, color='blue', label='Actual')
plt.plot(X_test_reg, y_pred_reg, color='red', linewidth=2, label='Predicted')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression: Actual vs Predicted')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
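Under the hood, ordinary least squares has a closed-form solution, theta = (X^T X)^(-1) X^T y, once a bias column is appended to X. A small sketch verifying that this matches scikit-learn's fitted coefficients (np.linalg.lstsq is more numerically stable in practice; the explicit inverse is used here only for clarity):

# Append a column of ones so the intercept is estimated as part of theta
X_b = np.hstack([np.ones((X_train_reg.shape[0], 1)), X_train_reg])
theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y_train_reg

print(f"Normal equation intercept: {theta[0][0]:.4f}, slope: {theta[1][0]:.4f}")
print(f"scikit-learn intercept:    {lr.intercept_[0]:.4f}, slope: {lr.coef_[0][0]:.4f}")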

Polynomial Regression

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Create polynomial features
poly_features = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly_features.fit_transform(X_reg)

# Split polynomial data
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, y_reg, test_size=0.2, random_state=42)

# Train polynomial regression
poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train_poly)

# Make predictions
y_pred_poly = poly_reg.predict(X_test_poly)

# Evaluate polynomial model
mse_poly = mean_squared_error(y_test_poly, y_pred_poly)
r2_poly = r2_score(y_test_poly, y_pred_poly)

print(f"\nPolynomial Regression Results:")
print(f"Mean Squared Error: {mse_poly:.4f}")
print(f"R² Score: {r2_poly:.4f}")

Unsupervised Learning: Clustering

Clustering algorithms group similar data points together without predefined labels.

K-Means Clustering

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Generate synthetic clustering data
X_cluster, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=42)

# Find optimal number of clusters using elbow method
inertias = []
K_range = range(1, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_cluster)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(K_range, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')

# Apply K-means with k chosen from the elbow curve (4, matching the number of generated centers)
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_cluster)

# Visualize clusters
plt.subplot(1, 2, 2)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=clusters, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], 
           s=300, c='red', marker='x', linewidths=3, label='Centroids')
plt.title(f'K-means Clustering (k={optimal_k})')
plt.legend()
plt.tight_layout()
plt.show()
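The elbow is sometimes ambiguous, so the silhouette score is a useful complementary criterion: it measures how well-separated the clusters are (higher is better, defined for k >= 2). A minimal sketch over the same data:

from sklearn.metrics import silhouette_score

# Silhouette score for several candidate values of k
for k in range(2, 8):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_cluster)
    print(f"k={k}: silhouette score = {silhouette_score(X_cluster, labels):.4f}")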

Hierarchical Clustering

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Perform hierarchical clustering
linkage_matrix = linkage(X_cluster, method='ward')

# Plot dendrogram
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
dendrogram(linkage_matrix, truncate_mode='level', p=3)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')

# Apply hierarchical clustering
hierarchical = AgglomerativeClustering(n_clusters=4)
hierarchical_clusters = hierarchical.fit_predict(X_cluster)

# Visualize hierarchical clusters
plt.subplot(1, 2, 2)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=hierarchical_clusters, cmap='viridis')
plt.title('Hierarchical Clustering')
plt.tight_layout()
plt.show()
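Since both algorithms were run on the same data, their label assignments can be compared with the adjusted Rand index, which is 1.0 for identical partitions and close to 0 for chance-level agreement. A short sketch:

from sklearn.metrics import adjusted_rand_score

# Compare the K-means labels with the hierarchical clustering labels
ari = adjusted_rand_score(clusters, hierarchical_clusters)
print(f"Adjusted Rand index between K-means and hierarchical clusterings: {ari:.4f}")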

Neural Networks and Deep Learning

Neural networks are the foundation of modern deep learning.

Building a Simple Neural Network

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# Generate XOR dataset
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([[0], [1], [1], [0]])

# Build neural network
model = keras.Sequential([
    keras.Input(shape=(2,)),
    layers.Dense(4, activation='relu'),
    layers.Dense(4, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_xor, y_xor, epochs=1000, verbose=0)

# Evaluate model
loss, accuracy = model.evaluate(X_xor, y_xor, verbose=0)
print(f"XOR Neural Network Results:")
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Make predictions
predictions = model.predict(X_xor)
print(f"\nPredictions:")
for x, y_true, y_pred in zip(X_xor, y_xor, predictions):
    print(f"Input: {x}, True: {y_true[0]}, Predicted: {y_pred[0]:.4f}")

# Plot training history
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'])
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'])
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.tight_layout()
plt.show()
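To demystify what the Dense layers compute, the trained weights can be pulled out of the model and the forward pass reproduced with plain NumPy: each layer is a matrix multiply, a bias addition, and an activation. A sketch, assuming the three Dense layers defined above:

def relu(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Extract learned weights and biases from the layers that have parameters
(W1, b1), (W2, b2), (W3, b3) = [layer.get_weights() for layer in model.layers
                                if layer.get_weights()]

# Manual forward pass: x @ W + b followed by the layer's activation
h1 = relu(X_xor @ W1 + b1)
h2 = relu(h1 @ W2 + b2)
manual_out = sigmoid(h2 @ W3 + b3)

print("Manual forward pass matches model.predict:",
      np.allclose(manual_out, model.predict(X_xor, verbose=0), atol=1e-4))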

Model Evaluation and Validation

Proper evaluation is crucial for building reliable ML models.

Cross-Validation

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Load iris dataset for demonstration
from sklearn.datasets import load_iris
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Perform cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

cv_scores = cross_val_score(rf, X_iris, y_iris, cv=cv, scoring='accuracy')

print(f"Cross-validation Results:")
print(f"Individual fold scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Compare multiple models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

print(f"\nModel Comparison:")
for name, model in models.items():
    scores = cross_val_score(model, X_iris, y_iris, cv=cv, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

Confusion Matrix and Classification Metrics

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Train a model and evaluate it on the same data it was fit on (illustrative only; a held-out evaluation follows this block)
rf_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_full.fit(X_iris, y_iris)

# Make predictions
y_pred_iris = rf_full.predict(X_iris)

# Create confusion matrix
cm = confusion_matrix(y_iris, y_pred_iris)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Print classification report
print("Classification Report:")
print(classification_report(y_iris, y_pred_iris, target_names=iris.target_names))
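Because the confusion matrix above was computed on the same data the model was trained on, it overstates real-world performance. A minimal sketch of the same evaluation on a held-out split:

# Hold out 30% of the data, train on the rest, and report metrics on the unseen portion
X_tr, X_te, y_tr, y_te = train_test_split(X_iris, y_iris, test_size=0.3,
                                          stratify=y_iris, random_state=42)
rf_holdout = RandomForestClassifier(n_estimators=100, random_state=42)
rf_holdout.fit(X_tr, y_tr)
y_te_pred = rf_holdout.predict(X_te)

print("Held-out classification report:")
print(classification_report(y_te, y_te_pred, target_names=iris.target_names))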

Feature Engineering and Selection

Good features are essential for model performance.

Feature Scaling and Normalization

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA

# Compare different scaling methods
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

# Apply different scalers
scaled_data = {}
for name, scaler in scalers.items():
    scaled_data[name] = scaler.fit_transform(X_iris)

# Visualize the effect of scaling
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0, 0].hist(X_iris[:, 0], bins=20, alpha=0.7)
axes[0, 0].set_title('Original Data')

for i, (name, data) in enumerate(scaled_data.items()):
    row = (i + 1) // 2
    col = (i + 1) % 2
    axes[row, col].hist(data[:, 0], bins=20, alpha=0.7)
    axes[row, col].set_title(name)

plt.tight_layout()
plt.show()

# Feature selection using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_iris)

print(f"PCA Results:")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Cumulative explained variance: {np.cumsum(pca.explained_variance_ratio_)}")

# Visualize PCA results
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_iris, cmap='viridis')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.title('PCA: Iris Dataset')
plt.colorbar(scatter)
plt.show()
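PCA creates new features as combinations of the originals; univariate feature selection instead keeps a subset of the original columns. A short sketch using SelectKBest with the ANOVA F-test (k=2 is arbitrary here):

from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 2 features with the highest F-scores against the class labels
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X_iris, y_iris)

selected_names = np.array(iris.feature_names)[selector.get_support()]
print(f"F-scores per feature: {selector.scores_}")
print(f"Selected features: {list(selected_names)}")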

Practical ML Pipeline

Let's build a complete ML pipeline covering data preprocessing, hyperparameter tuning, evaluation, and saving the fitted model for reuse.

Complete ML Workflow

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Create synthetic dataset with missing values and categorical features
np.random.seed(42)
n_samples = 1000

# Generate features
age = np.random.normal(35, 10, n_samples)
income = np.random.normal(50000, 20000, n_samples)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples)
experience = np.random.normal(10, 5, n_samples)

# Create binary target: whether income exceeds 60,000 (income is also a feature, so this is an easy, illustrative task)
salary_high = (income > 60000).astype(int)

# Add some missing values
age[np.random.choice(n_samples, 50, replace=False)] = np.nan
income[np.random.choice(n_samples, 30, replace=False)] = np.nan

# Create DataFrame
df = pd.DataFrame({
    'age': age,
    'income': income,
    'education': education,
    'experience': experience,
    'salary_high': salary_high
})

print("Dataset Info:")
print(df.info())
print(f"\nMissing values:\n{df.isnull().sum()}")

# Split features and target
X = df.drop('salary_high', axis=1)
y = df['salary_high']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing pipeline
numeric_features = ['age', 'income', 'experience']
categorical_features = ['education']

# Numeric preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # LabelEncoder is for targets; OneHotEncoder handles feature columns
])

# Combine preprocessing
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10]
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f"\nBest parameters: {best_params}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Evaluate on test set
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {test_accuracy:.4f}")

# Feature importance (numeric features followed by the one-hot encoded education categories)
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    feature_importance = best_model.named_steps['classifier'].feature_importances_
    encoder = best_model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['encoder']
    feature_names = numeric_features + [f"education_{cat}" for cat in encoder.categories_[0]]
    
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(feature_importance)), feature_importance)
    plt.xticks(range(len(feature_importance)), feature_names, rotation=45)
    plt.title('Feature Importance')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.show()
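Once the tuned pipeline is trained, it can be persisted and reloaded as a single object (preprocessing plus model), which is the usual first step toward deployment. A sketch using joblib; the filename and the sample row are made up for illustration:

import joblib

# Save the fitted pipeline to disk
joblib.dump(best_model, 'salary_pipeline.joblib')

# Later, or in a serving process: reload and predict on a new row with the same column names
loaded_model = joblib.load('salary_pipeline.joblib')
sample = pd.DataFrame([{'age': 42, 'income': 72000, 'education': 'Master', 'experience': 15}])
print(f"Prediction for sample applicant: {loaded_model.predict(sample)[0]}")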

Conclusion

The field of machine learning is constantly evolving, so continuous learning and experimentation are essential. Start with simple models and gradually work your way up to more complex algorithms as you gain experience.