import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

# Load the dry bean dataset, separate features from labels, and standardize.

# Import the scaler explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.preprocessing` submodule is loaded as an attribute.
from sklearn.preprocessing import StandardScaler

beans = pd.read_csv('/cluster/academic/DATA311/202620/drybean.csv')

# Everything except the "Class" column is a feature; "Class" is the label.
features = beans.drop(columns="Class")
labels = beans["Class"]

scaler = StandardScaler()
scaler.set_output(transform='pandas') # tell it to keep the features as a DataFrame

# Replace the raw features with their standardized version.
features = scaler.fit_transform(features)

# Pairwise scatterplots of every standardized feature against every other.
sns.pairplot(data=features)

<seaborn.axisgrid.PairGrid at 0x14a08f80f560>

# ignore this code for now!
from sklearn.cluster import KMeans

def cluster_demo_fit(X, k=3):
    """
    Cluster X with scikit-learn's KMeans and return one cluster id per row.

    Args:
        X: Feature table or array handed straight to KMeans.fit_predict.
        k: Number of clusters to form.

    Returns:
        The cluster assignments produced by KMeans.fit_predict.
    """
    # random_state is pinned so repeated runs give the same clustering.
    return KMeans(n_clusters=k, random_state=42).fit_predict(X)

def cluster_demo_vis(df, x_col, y_col, k=3):
    """
    Show true classes and predicted clusters side by side on two feature axes.

    Args:
        df: DataFrame holding x_col, y_col, a 'Class' column and a 'cluster' column.
        x_col: Name of the column plotted on the x-axis.
        y_col: Name of the column plotted on the y-axis.
        k: Number of clusters; only used in the right-hand plot title.
    """
    _, (truth_ax, cluster_ax) = plt.subplots(1, 2, figsize=(10, 4))

    # Arguments common to both scatterplots.
    shared = dict(data=df, x=x_col, y=y_col, s=20, alpha=0.8)

    # Left panel: points colored by their true class.
    sns.scatterplot(hue='Class', palette='viridis', ax=truth_ax, **shared)
    truth_ax.set_title('Ground Truth (Actual Species)')
    truth_ax.legend(title='Species')

    # Right panel: points colored by their assigned cluster.
    sns.scatterplot(hue='cluster', palette='Set1', ax=cluster_ax, legend='full', **shared)
    cluster_ax.set_title(f'K-Means Clusters (K={k})')
    cluster_ax.legend(title='Cluster ID')

    plt.show()

# Cluster the standardized features; cluster_demo_fit yields one cluster id per row
K = 7
beans['cluster'] = cluster_demo_fit(features, k=K)

# Pick the two feature axes used for the 2-D view
x_col, y_col = 'AspectRatio', 'ShapeFactor1'  # columns 4 and 12

# Side-by-side scatterplots: true classes vs. assigned clusters
cluster_demo_vis(beans, x_col, y_col, k=K)

# Contingency table of species (rows) against assigned cluster (columns)
comparison_table = pd.crosstab(index=beans['Class'], columns=beans['cluster'])
sns.heatmap(data=comparison_table, annot=True, fmt='g')

<Axes: xlabel='cluster', ylabel='Class'>

# Visualization code - feel free to ignore
from IPython.display import display, clear_output


def visualize_clusters(X, centroids, labels, iteration, x_index, y_index, title):
    """
    Draw, save, and display a 2-D snapshot of the clustering state.

    The figure is written to "kmeans_<iteration>.png" (two-digit, zero-padded),
    so calls that share an iteration number overwrite the same file.  The
    previous notebook output is cleared before the new figure is displayed.

    Args:
        X: Data points (n_samples, n_features)
        centroids: Current centroid positions (k, n_features)
        labels: Cluster assignments for each point (n_samples,), or None
            when no assignment has been made yet
        iteration: Current iteration number
        x_index: Feature column shown on the horizontal axis
        y_index: Feature column shown on the vertical axis
        title: Plot title
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    if labels is None:
        # Nothing assigned yet: draw every point in gray.
        ax.scatter(X[:, x_index], X[:, y_index],
                   s=50, alpha=0.6, c='gray', label='Unassigned')
    else:
        # One scatter call per cluster so each gets its own color and legend entry.
        for cluster_id in range(len(centroids)):
            members = X[labels == cluster_id, :]
            ax.scatter(members[:, x_index], members[:, y_index],
                       s=50, alpha=0.6, label=f'Cluster {cluster_id}')

    # Centroids are drawn last, as large red stars above the data points.
    ax.scatter(centroids[:, x_index], centroids[:, y_index],
               s=300, c='red', marker='*',
               edgecolors='black', linewidths=2,
               label='Centroids', zorder=10)

    ax.set_title(f'{title} - Iteration {iteration}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.savefig(f"kmeans_{iteration:02}.png")

    # Swap the displayed output for the new frame, then release the figure.
    clear_output(wait=True)
    display(fig)
    plt.close(fig)

def kmeans(X, k, x_index=0, y_index=1, max_iterations=100, random_seed=42,
           visualize=True):
    """
    Cluster the rows of X into k groups with the iterative k-means procedure.

    Args:
        X: Data points (n_samples, n_features); converted to a float array.
        k: Number of clusters; must not exceed n_samples.
        x_index: Feature column plotted on the x-axis of each snapshot.
        y_index: Feature column plotted on the y-axis of each snapshot.
        max_iterations: Upper bound on the number of assign/update rounds.
        random_seed: Seed used to pick the initial centroids.
        visualize: When True (the default) call visualize_clusters after
            initialization and after each assignment / centroid update.
            Pass False to run the algorithm without any plotting.

    Returns:
        (centroids, labels): centroids is a (k, n_features) float array and
        labels is an (n_samples,) integer array of cluster ids.

    Raises:
        ValueError: if X is not 2-D or k is larger than n_samples.
    """
    # Work in floating point so centroid means are not truncated when the
    # caller passes integer data.
    X = np.asarray(X, dtype=float)
    n_samples, _ = X.shape

    # A private RandomState seeded with random_seed draws the same starting
    # points as np.random.seed(random_seed) would, without reseeding NumPy's
    # global random state as a side effect.
    rng = np.random.RandomState(random_seed)

    # Initialize centroids from k distinct data points (fancy indexing copies).
    random_indices = rng.choice(n_samples, size=k, replace=False)
    centroids = X[random_indices]

    if visualize:
        visualize_clusters(X, centroids, None, 0, x_index, y_index, "Initial Random Centroids")

    labels = np.zeros(n_samples, dtype=int)
    # Iterate until convergence or max iterations
    for iteration in range(1, max_iterations + 1):

        ### Step 1: Assign each point to nearest centroid ###
        # Broadcasting builds an (n_samples, k, n_features) array of offsets;
        # the L2 norm over the last axis gives every point-to-centroid distance.
        offsets = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt((offsets ** 2).sum(axis=2))
        labels = distances.argmin(axis=1)

        if visualize:
            visualize_clusters(X, centroids, labels, iteration, x_index, y_index, "After Assignment")

        ### Step 2: Update centroid locations ###
        # Start from the current centroids so a cluster that received no
        # points keeps its position instead of collapsing to the origin.
        new_centroids = centroids.copy()
        for cluster_id in range(k):
            members = labels == cluster_id
            if members.any():
                new_centroids[cluster_id] = X[members].mean(axis=0)

        # Converged once the centroids stop moving (default allclose
        # tolerances; a relative tolerance of 1 would stop far too early).
        converged = np.allclose(centroids, new_centroids)
        centroids = new_centroids
        if converged:
            print(f"Converged after {iteration} iterations")
            break

        if visualize:
            visualize_clusters(X, centroids, labels, iteration, x_index, y_index, "After Centroid Update")

    return centroids, labels

# Run the hand-written k-means on the standardized features as a plain array,
# plotting feature columns 4 and 12.
X = features.to_numpy()
centroids, labels = kmeans(X, k=7, x_index=4, y_index=12, random_seed=12)

Converged after 11 iterations

# Store the cluster ids from the hand-written k-means next to the true classes
beans["cluster"] = labels

# Contingency table of species (rows) against assigned cluster (columns)
comparison_table = pd.crosstab(index=beans['Class'], columns=beans['cluster'])
sns.heatmap(data=comparison_table, annot=True, fmt='g')

<Axes: xlabel='cluster', ylabel='Class'>

Lecture 22 - Unsupervised Learning: Clustering

Announcements:

Goals:

ML For Data Science: Unsupervised Learning

Clustering

Clustering: How?