import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
An example of using PCA on a synthetic dataset:
# Authors: Gael Varoquaux
# Jaques Grobler
# Kevin Hughes
# License: BSD 3 clause
%matplotlib notebook
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# #############################################################################
# Create the data
e = np.exp(1)
np.random.seed(4)
def pdf(x):
    return 0.5 * (stats.norm(scale=0.25 / e).pdf(x) + stats.norm(scale=4 / e).pdf(x))
y = np.random.normal(scale=0.5, size=(30000))
x = np.random.normal(scale=0.5, size=(30000))
z = np.random.normal(scale=0.1, size=len(x))
density = pdf(x) * pdf(y)
pdf_z = pdf(5 * z)
density *= pdf_z
a = x + y
b = 2 * y
c = a - b + z
norm = np.sqrt(a.var() + b.var())
a /= norm
b /= norm
# #############################################################################
# Plot the figures
def plot_figs(fig_num, elev, azim):
    fig = plt.figure(fig_num, figsize=(6, 5))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=elev, azim=azim)

    # scatter a subsample of the points, colored by their density
    ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker="+", alpha=0.4)

    # fit PCA on the 3-D data and use the principal axes to draw the plane
    # spanned by the first two components
    Y = np.c_[a, b, c]
    pca = PCA(n_components=3)
    pca.fit(Y)
    V = pca.components_.T

    x_pca_axis, y_pca_axis, z_pca_axis = 3 * V
    x_pca_plane = np.r_[x_pca_axis[:2], -x_pca_axis[1::-1]]
    y_pca_plane = np.r_[y_pca_axis[:2], -y_pca_axis[1::-1]]
    z_pca_plane = np.r_[z_pca_axis[:2], -z_pca_axis[1::-1]]
    x_pca_plane.shape = (2, 2)
    y_pca_plane.shape = (2, 2)
    z_pca_plane.shape = (2, 2)
    ax.plot_surface(x_pca_plane, y_pca_plane, z_pca_plane)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")

    print("Component Vectors (one per column):")
    print(pca.components_.T)
    print("Explained variance:")
    print(pca.explained_variance_ratio_)
plot_figs(1, 0, 0)
# elev = 30
# azim = 20
# plot_figs(2, elev, azim)
plt.show()
Component Vectors (one per column):
[[-0.33847725 -0.7109608   0.61641536]
 [-0.77400604 -0.1621726  -0.61205775]
 [ 0.53511475 -0.68427684 -0.49539622]]
Explained variance:
[0.72408528 0.27427885 0.00163586]
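The explained variance ratios show that almost all of the variance lies in the first two components, so the data can be reduced to two dimensions with little loss. A minimal sketch (reusing the `a`, `b`, `c` arrays defined above; `pca_2d` and `Y_reduced` are names introduced here for illustration):

```python
# Hedged sketch: project the synthetic 3-D data onto its first two
# principal components. Reuses a, b, c from the data-generation cell above.
from sklearn.decomposition import PCA

Y = np.c_[a, b, c]                     # (30000, 3) data matrix
pca_2d = PCA(n_components=2).fit(Y)    # keep only the top two components
Y_reduced = pca_2d.transform(Y)        # shape (30000, 2)
print(Y_reduced.shape)
print(pca_2d.explained_variance_ratio_.sum())  # fraction of variance retained
```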
Find clusters of points based on proximity, density, or other similarity metrics.
Example algorithm: k-means (a bare-bones version is sketched below)
Demo: https://www.naftaliharris.com/blog/visualizing-k-means-clustering/
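As the demo animates, k-means alternates between assigning each point to its nearest center and moving each center to the mean of its assigned points. A minimal NumPy sketch of that loop (illustrative only, `kmeans_sketch` is a made-up helper; in practice we use scikit-learn's `KMeans`, as below):

```python
# Hedged sketch of Lloyd's algorithm, the loop the demo visualizes.
# Ignores edge cases such as a cluster becoming empty.
import numpy as np

def kmeans_sketch(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]  # random initial centers
    for _ in range(n_iter):
        # assignment step: nearest center for every point
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # update step: each center moves to the mean of its assigned points
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return centers, labels
```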
penguins = sns.load_dataset("penguins")
sns.relplot(data=penguins, x="flipper_length_mm", y="bill_length_mm")
<seaborn.axisgrid.FacetGrid at 0x1709aad30>
Xdf = penguins[["flipper_length_mm", "bill_length_mm"]].dropna()
X = Xdf.to_numpy()
X.shape
(342, 2)
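The two features are on different numeric scales (flipper length around 200 mm, bill length around 45 mm), and k-means uses Euclidean distance, so the larger-scale feature contributes more. A minimal sketch of standardizing first (not applied in the cells below, which cluster the raw values):

```python
# Hedged sketch: standardize each column to zero mean and unit variance so
# both features contribute comparably to the distances. Not used below.
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)
```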
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
Xdf["cluster"] = kmeans.labels_
Xdf.info()
Xdf
<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 343
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   flipper_length_mm  342 non-null    float64
 1   bill_length_mm     342 non-null    float64
 2   cluster            342 non-null    int32
dtypes: float64(2), int32(1)
memory usage: 9.4 KB
|     | flipper_length_mm | bill_length_mm | cluster |
|-----|-------------------|----------------|---------|
| 0   | 181.0             | 39.1           | 0       |
| 1   | 186.0             | 39.5           | 0       |
| 2   | 195.0             | 40.3           | 2       |
| 4   | 193.0             | 36.7           | 0       |
| 5   | 190.0             | 39.3           | 0       |
| ... | ...               | ...            | ...     |
| 338 | 214.0             | 47.2           | 1       |
| 340 | 215.0             | 46.8           | 1       |
| 341 | 222.0             | 50.4           | 1       |
| 342 | 212.0             | 45.2           | 1       |
| 343 | 213.0             | 49.9           | 1       |

342 rows × 3 columns
kmeans.cluster_centers_
array([[186.99166667,  38.4275    ],
       [216.88372093,  47.56744186],
       [196.7311828 ,  45.95483871]])
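Each row of `cluster_centers_` is the (flipper length, bill length) centroid of one cluster. `predict` assigns new points to the nearest centroid; a small sketch with a made-up measurement:

```python
# Hedged sketch: assign a hypothetical new penguin to its nearest cluster center.
new_penguin = np.array([[200.0, 45.0]])   # flipper_length_mm, bill_length_mm (made up)
print(kmeans.predict(new_penguin))        # index of the closest centroid
```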
fig = plt.figure()
plt.subplot(1,2,1)
sns.scatterplot(data=Xdf, x="flipper_length_mm", y="bill_length_mm", hue="cluster")
plt.scatter(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1], c="red", marker="x")
plt.subplot(1,2,2)
sns.scatterplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", hue="species")
plt.show()
An example that ties together clustering and PCA: plot_kmeans_digits.ipynb
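A condensed, hedged sketch of the same idea, reducing the digits data to 2D with PCA and then clustering with k-means; the referenced notebook works through this more carefully, and its details may differ:

```python
# Hedged sketch combining PCA and k-means on the digits data, in the spirit
# of the referenced plot_kmeans_digits example.
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

digits = load_digits()
X2 = PCA(n_components=2).fit_transform(digits.data)   # 64-D images -> 2-D points
labels = KMeans(n_clusters=10, n_init=10, random_state=0).fit_predict(X2)

plt.scatter(X2[:, 0], X2[:, 1], c=labels, s=8, cmap="tab10")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("k-means clusters in PCA space (digits)")
plt.show()
```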