Lecture 19 - Distance Metrics, Dimensionality Reduction, and Clustering¶

In [20]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Announcements:¶

  • Faculty candidate talks:
    • Thursday 4pm CF 105 Research Talk
      • Title: Estimating Demand for Online Shopping using Limited Historical Observation
    • Friday 4pm CF 316 Teaching Demo
      • Title: Introduction to Algorithms for Graphs: Representations, and Search Algorithms
  • FP groups formed by tonight - who needs a group?

Goals:¶

  • Know the basic motivation for dimensionality reduction and clustering.
  • Know how to compute the $L^p$ family of distance metrics, and understand the intuition for the special cases $L^0$, $L^1$, $L^2$, and $L^\infty$.
  • Be able to interpret the results of Principal Components Analysis.
  • Know how the K-means clustering algorithm works.
  • Be able to use scikit-learn to perform dimensionality reduction (PCA) and clustering (K-Means).

Feature Extraction¶

The vast majority of machine learning methods assume each of your input datapoints is represented by a feature vector.

If it's not, it's usually your job to make it so - this is called feature extraction.

Given a DataFrame, we can treat each row as the feature vector for the thing the row describes. Traditionally, we arrange our dataset in an $N \times D$ matrix, where each row corresponds to a datapoint and each column corresponds to a single feature (variable). This is the same layout as a pandas table.

If your input is an audio signal, a sentence of text, an image, or some other not-obviously-vector-like thing, there may be more work to do.

Given a dataframe like the penguins dataset, it's pretty easy to get to an ML-style training dataset X, y:

In [21]:
penguins = sns.load_dataset("penguins")
sns.relplot(data=penguins, x="flipper_length_mm", y="bill_length_mm")
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x7facae30b4f0>
In [22]:
X = penguins[["flipper_length_mm", "bill_length_mm"]].to_numpy()
X.shape
Out[22]:
(344, 2)
In [23]:
X[:10,:]
Out[23]:
array([[181. ,  39.1],
       [186. ,  39.5],
       [195. ,  40.3],
       [  nan,   nan],
       [193. ,  36.7],
       [190. ,  39.3],
       [181. ,  38.9],
       [195. ,  39.2],
       [193. ,  34.1],
       [190. ,  42. ]])
In [25]:
penguins["species"].value_counts()
Out[25]:
Adelie       152
Gentoo       124
Chinstrap     68
Name: species, dtype: int64
In [26]:
y = penguins["species"].map({"Gentoo": 1, "Adelie": 2, "Chinstrap": 3}).to_numpy()
y.shape
Out[26]:
(344,)
In [27]:
y
Out[27]:
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

Unsupervised Learning¶

Goal: discover structure without any ground-truth labels.

What might we mean by structure? A non-exhaustive list:

  • Intrinsic dimensionality / the lower-dimensional manifold in which the data lives
  • Natural groupings based on proximity

Thinking in High-Dimensional Space¶

This is hard, and our intuition tends to fall apart. However, real high-dimensional data often lies on a lower-dimensional manifold.

What the heck does that mean?

If you sliced, rotated, projected, warped, etc. your space in just the right ways, you could represent the same information with fewer dimensions. The smallest number of dimensions needed to represent your data is called its intrinsic dimensionality.

Dimensionality Reduction¶

Ways to get your $n$ feature vectors from $d$ dimensions to $d'$ dimensions (where $d' < d$).

Good for:

  • Visualizing higher-dimensional data in 2D or 3D
  • Finding (approximate) intrinsic dimensionality
  • Speeding up your ML pipeline by reducing $d$

In all cases, these benefits likely come at the expense of some accuracy.

Here are two common approaches that are limited to linear notions of stretching, slicing, warping, etc.:

Principal Components Analysis¶

Reduces dimensionality by finding $d'$ new features (each is a linear combination of the old features) that explain as much variance as possible.

Random Projections¶

Reduces dimensionality by multiplying $X_{n \times d}$ by a random matrix $P_{d \times d'}$, resulting in a reduced-dimensionality dataset $X'_{n \times d'}$.

Huh?

Somewhat surprisingly, this works pretty well.

Question: What's our metric for "works"?

Answer: It approximately preserves pairwise distances between points.
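
To see this concretely, here's a minimal sketch (not from the lecture) that projects random 100-dimensional data down to 20 dimensions with a Gaussian random matrix, scaled by $1/\sqrt{d'}$ (a common choice), and checks how well pairwise distances survive using scipy's pdist:

In [ ]:
# Sketch (not lecture code): random projection approximately preserves pairwise distances.
import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
n, d, d_new = 1000, 100, 20

X = rng.normal(size=(n, d))
P = rng.normal(size=(d, d_new)) / np.sqrt(d_new)  # scale so squared lengths are preserved in expectation
X_reduced = X @ P                                 # shape (n, d_new)

orig = pdist(X)           # all pairwise Euclidean distances in the original space
reduced = pdist(X_reduced)
print("median relative distance error:", np.median(np.abs(reduced - orig) / orig))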

Distance Metrics¶

$L^p$ Distances¶

A common family of distance metrics is the $L^p$ distance:

$$d_p(a, b) = \sqrt[p]{\sum_{i=1}^d |a_i - b_i|^p}$$

When $p = 2$, this is the Euclidean distance we're all used to, based on the Pythagorean theorem; in 2D, it reduces to: $$\sqrt{(b_x - a_x)^2 + (b_y - a_y)^2}$$
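
As a quick sanity check, here's a small sketch (not lecture code) that computes $L^p$ distances with plain numpy; for $p = 2$ it matches np.linalg.norm, and $L^\infty$ is just the largest per-dimension difference:

In [ ]:
# Sketch (not lecture code): L^p distances between two small vectors.
import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 0.0, 3.0])

def lp_distance(a, b, p):
    # sum of |a_i - b_i|^p, then take the p-th root
    return np.sum(np.abs(a - b) ** p) ** (1 / p)

print(lp_distance(a, b, 1))        # L^1 (Manhattan): 3 + 2 + 0 = 5
print(lp_distance(a, b, 2))        # L^2 (Euclidean): sqrt(3^2 + 2^2) ≈ 3.61
print(np.linalg.norm(a - b))       # same value as L^2
print(np.max(np.abs(a - b)))       # L^infinity: the largest per-dimension difference (3)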

Different values of $p$ give different behavior:

  • For smaller $p$, the distance is less sensitive to how the per-dimension differences compare to each other.
  • For larger $p$, the distance is more sensitive to how the per-dimension differences compare to each other; in the limit ($L^\infty$), only the largest per-dimension difference matters.

A few examples of the "unit circle" under different $L^p$ distances:
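
Here is a sketch (not lecture code) that draws these unit circles by taking points on the ordinary circle and rescaling each one so it lies at $L^p$ distance 1 from the origin:

In [ ]:
# Sketch (not lecture code): points at L^p distance 1 from the origin, for several p.
# Note: p < 1 does not give a true metric, but the shape is illustrative.
import numpy as np
import matplotlib.pyplot as plt

theta = np.linspace(0, 2 * np.pi, 400)
directions = np.c_[np.cos(theta), np.sin(theta)]   # points on the ordinary (L^2) unit circle

for p in [0.5, 1, 2, 4]:
    # rescale each point so its L^p distance from the origin is exactly 1
    lengths = np.sum(np.abs(directions) ** p, axis=1) ** (1 / p)
    unit_circle = directions / lengths[:, None]
    plt.plot(unit_circle[:, 0], unit_circle[:, 1], label=f"p = {p}")

plt.gca().set_aspect("equal")
plt.legend()
plt.show()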

$L^1$ and $L^2$ are by far the most common choices here.

Hamming Distance¶

For vectors of categorical values, Hamming distance is the number of dimensions in which two vectors differ: $$d(a, b) = \sum_i \mathbb{1}(a_i \ne b_i)$$ where $\mathbb{1}(\cdot)$ is an indicator function that has value 1 if its argument is true and 0 otherwise.
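
A tiny sketch (not lecture code) of Hamming distance on categorical vectors using numpy:

In [ ]:
# Sketch (not lecture code): Hamming distance = number of positions that differ.
import numpy as np

a = np.array(["red", "small", "round"])
b = np.array(["red", "large", "round"])
print(np.sum(a != b))   # 1: the vectors differ only in the second position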

Cosine Similarity¶

A similarity (not distance) metric that considers only vector direction, not magnitude:

$$ sim(a, b) = \cos \theta = \frac{a^Tb}{\sqrt{(a^Ta)(b^Tb)}}$$
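
A quick sketch (not lecture code, using a small helper defined here) showing that cosine similarity ignores magnitude: a vector and any positive rescaling of it have similarity 1, while orthogonal vectors have similarity 0:

In [ ]:
# Sketch (not lecture code): cosine similarity depends only on direction.
import numpy as np

def cosine_similarity(a, b):
    return (a @ b) / np.sqrt((a @ a) * (b @ b))

a = np.array([1.0, 2.0, 3.0])
print(cosine_similarity(a, 10 * a))                       # 1.0: same direction, different magnitude
print(cosine_similarity(a, np.array([-3.0, 0.0, 1.0])))   # 0.0: orthogonal vectors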

The curse of dimensionality, very briefly¶

It's worth noting that many distance metrics become less and less useful as $d$ gets larger. There are a few ways to think about this, but the simplest is just that more dimensions means more opportunities for points to be far apart.

Cosine similarity often works better in high dimensions because it ignores magnitude.
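
One way to see the effect (a sketch, not from the lecture): sample random points in increasingly many dimensions and compare the farthest and nearest pairwise distances. The ratio shrinks toward 1 as $d$ grows, so "near" and "far" become harder to distinguish:

In [ ]:
# Sketch (not lecture code): in high dimensions, the nearest and farthest
# pairwise Euclidean distances become relatively close together.
import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
for d in [2, 10, 100, 1000]:
    X = rng.uniform(size=(200, d))   # 200 random points in the unit hypercube
    dists = pdist(X)                 # all pairwise Euclidean distances
    print(f"d = {d:4d}   farthest / nearest distance ratio: {dists.max() / dists.min():.2f}")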

Principal Components Analysis¶

An example of using PCA on a synthetic dataset:

In [31]:
# Authors: Gael Varoquaux
#          Jaques Grobler
#          Kevin Hughes
# Adapted by Scott Wehrwein for DATA 311
# License: BSD 3 clause
%matplotlib inline
import sklearn
import sklearn.decomposition

from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats


# #############################################################################
# Create the data

e = np.exp(1)
np.random.seed(4)


def pdf(x):
    return 0.5 * (stats.norm(scale=0.25 / e).pdf(x) + stats.norm(scale=4 / e).pdf(x))


y = np.random.normal(scale=0.5, size=(30000))
x = np.random.normal(scale=0.5, size=(30000))
z = np.random.normal(scale=0.1, size=len(x))

density = pdf(x) * pdf(y)
pdf_z = pdf(5 * z)

density *= pdf_z

a = x + y
b = 2 * y
c = a - b + z

norm = np.sqrt(a.var() + b.var())
a /= norm
b /= norm


# #############################################################################
# Do PCA and plot a figure showing the data and the plane spanned by the first 2
# PCs
def plot_figs(fig_num, elev, azim):
    Y = np.c_[a, b, c]

    pca = sklearn.decomposition.PCA(n_components=3)
    pca.fit(Y)
    V = pca.components_.T

    # from here on is just plotting stuff:
    fig = plt.figure(fig_num, figsize=(8, 5))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=elev, azim=azim)

    ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker="+", alpha=0.4)


    x_pca_axis, y_pca_axis, z_pca_axis = 3 * V
    x_pca_plane = np.r_[x_pca_axis[:2], -x_pca_axis[1::-1]]
    y_pca_plane = np.r_[y_pca_axis[:2], -y_pca_axis[1::-1]]
    z_pca_plane = np.r_[z_pca_axis[:2], -z_pca_axis[1::-1]]
    x_pca_plane.shape = (2, 2)
    y_pca_plane.shape = (2, 2)
    z_pca_plane.shape = (2, 2)
    ax.plot_surface(x_pca_plane, y_pca_plane, z_pca_plane)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    print("Component Vectors (one per column):")
    print(pca.components_.T)
    print("Explained variance:")
    print(pca.explained_variance_)
In [32]:
plot_figs(1, 0, 0)
Component Vectors (one per column):
[[-0.33847725 -0.7109608   0.61641536]
 [-0.77400604 -0.1621726  -0.61205775]
 [ 0.53511475 -0.68427684 -0.49539622]]
Explained variance:
[1.0908032  0.41318925 0.00246436]
In [33]:
elev = 30
azim = 20
plot_figs(2, elev, azim)

plt.show()
Component Vectors (one per column):
[[-0.33847725 -0.7109608   0.61641536]
 [-0.77400604 -0.1621726  -0.61205775]
 [ 0.53511475 -0.68427684 -0.49539622]]
Explained variance:
[1.0908032  0.41318925 0.00246436]

Clustering¶

Find clusters of points based on proximity, density, or other similar criteria.

Example algorithm: K-means

Demo: https://www.naftaliharris.com/blog/visualizing-k-means-clustering/

In [34]:
penguins = sns.load_dataset("penguins")
sns.relplot(data=penguins, x="flipper_length_mm", y="bill_length_mm")
Out[34]:
<seaborn.axisgrid.FacetGrid at 0x7faca7787cd0>
In [36]:
Xdf = penguins[["flipper_length_mm", "bill_length_mm"]].dropna()
X = Xdf.to_numpy()
X.shape
Out[36]:
(342, 2)
In [37]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

Xdf["cluster"] = kmeans.labels_

Xdf.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 343
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   flipper_length_mm  342 non-null    float64
 1   bill_length_mm     342 non-null    float64
 2   cluster            342 non-null    int32  
dtypes: float64(2), int32(1)
memory usage: 9.4 KB
In [38]:
kmeans.cluster_centers_
Out[38]:
array([[196.7311828 ,  45.95483871],
       [216.88372093,  47.56744186],
       [186.99166667,  38.4275    ]])
In [39]:
fig = plt.figure()
plt.subplot(1,2,1)
sns.scatterplot(data=Xdf, x="flipper_length_mm", y="bill_length_mm", hue="cluster")
plt.scatter(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1], c="red", marker="x")
plt.subplot(1,2,2)
sns.scatterplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", hue="species")
plt.show()
In [ ]:
sns.relplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", hue="species")