import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
An example of using PCA on a synthetic dataset:
# Authors: Gael Varoquaux
# Jaques Grobler
# Kevin Hughes
# License: BSD 3 clause
%matplotlib notebook
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# #############################################################################
# Create the data
e = np.exp(1)
np.random.seed(4)
def pdf(x):
    return 0.5 * (stats.norm(scale=0.25 / e).pdf(x) + stats.norm(scale=4 / e).pdf(x))
y = np.random.normal(scale=0.5, size=(30000))
x = np.random.normal(scale=0.5, size=(30000))
z = np.random.normal(scale=0.1, size=len(x))
density = pdf(x) * pdf(y)
pdf_z = pdf(5 * z)
density *= pdf_z
a = x + y
b = 2 * y
c = a - b + z
norm = np.sqrt(a.var() + b.var())
a /= norm
b /= norm
# #############################################################################
# Plot the figures
def plot_figs(fig_num, elev, azim):
    fig = plt.figure(fig_num, figsize=(6, 5))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=elev, azim=azim)

    # scatter a subsample of the points, colored by their density
    ax.scatter(a[::10], b[::10], c[::10], c=density[::10], marker="+", alpha=0.4)

    # fit PCA on the 3-D data and use the principal axes to draw the plane
    # spanned by the first two components
    Y = np.c_[a, b, c]
    pca = PCA(n_components=3)
    pca.fit(Y)
    V = pca.components_.T

    x_pca_axis, y_pca_axis, z_pca_axis = 3 * V
    x_pca_plane = np.r_[x_pca_axis[:2], -x_pca_axis[1::-1]]
    y_pca_plane = np.r_[y_pca_axis[:2], -y_pca_axis[1::-1]]
    z_pca_plane = np.r_[z_pca_axis[:2], -z_pca_axis[1::-1]]
    x_pca_plane.shape = (2, 2)
    y_pca_plane.shape = (2, 2)
    z_pca_plane.shape = (2, 2)
    ax.plot_surface(x_pca_plane, y_pca_plane, z_pca_plane)
    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")

    print("Component Vectors (one per column):")
    print(pca.components_.T)
    print("Explained variance:")
    print(pca.explained_variance_ratio_)
plot_figs(1, 0, 0)
# elev = 30
# azim = 20
# plot_figs(2, elev, azim)
plt.show()
Component Vectors (one per column):
[[-0.33847725 -0.7109608   0.61641536]
 [-0.77400604 -0.1621726  -0.61205775]
 [ 0.53511475 -0.68427684 -0.49539622]]
Explained variance:
[0.72408528 0.27427885 0.00163586]
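The explained variance ratios show that almost all of the variance lies in the first two components, so the data can be reduced to two dimensions with little loss. A minimal sketch (reusing the `a`, `b`, `c` arrays defined above; `pca_2d` and `Y_reduced` are names introduced here for illustration):

```python
# Hedged sketch: project the synthetic 3-D data onto its first two
# principal components. Reuses a, b, c from the data-generation cell above.
from sklearn.decomposition import PCA

Y = np.c_[a, b, c]                     # (30000, 3) data matrix
pca_2d = PCA(n_components=2).fit(Y)    # keep only the top two components
Y_reduced = pca_2d.transform(Y)        # shape (30000, 2)
print(Y_reduced.shape)
print(pca_2d.explained_variance_ratio_.sum())  # fraction of variance retained
```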
Find clusters of points based on proximity, density, or other similarity metrics.
Example algorithm: k-means (a bare-bones version is sketched below)
Demo: https://www.naftaliharris.com/blog/visualizing-k-means-clustering/
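As the demo animates, k-means alternates between assigning each point to its nearest center and moving each center to the mean of its assigned points. A minimal NumPy sketch of that loop (illustrative only, `kmeans_sketch` is a made-up helper; in practice we use scikit-learn's `KMeans`, as below):

```python
# Hedged sketch of Lloyd's algorithm, the loop the demo visualizes.
# Ignores edge cases such as a cluster becoming empty.
import numpy as np

def kmeans_sketch(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]  # random initial centers
    for _ in range(n_iter):
        # assignment step: nearest center for every point
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # update step: each center moves to the mean of its assigned points
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return centers, labels
```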
penguins = sns.load_dataset("penguins")
sns.relplot(data=penguins, x="flipper_length_mm", y="bill_length_mm")
<seaborn.axisgrid.FacetGrid at 0x1709aad30>
Xdf = penguins[["flipper_length_mm", "bill_length_mm"]].dropna()
X = Xdf.to_numpy()
X.shape
(342, 2)
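The two features are on different numeric scales (flipper length around 200 mm, bill length around 45 mm), and k-means uses Euclidean distance, so the larger-scale feature contributes more. A minimal sketch of standardizing first (not applied in the cells below, which cluster the raw values):

```python
# Hedged sketch: standardize each column to zero mean and unit variance so
# both features contribute comparably to the distances. Not used below.
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)
```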
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
Xdf["cluster"] = kmeans.labels_
Xdf.info()
Xdf
<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 343
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   flipper_length_mm  342 non-null    float64
 1   bill_length_mm     342 non-null    float64
 2   cluster            342 non-null    int32
dtypes: float64(2), int32(1)
memory usage: 9.4 KB
|     | flipper_length_mm | bill_length_mm | cluster |
|-----|-------------------|----------------|---------|
| 0   | 181.0             | 39.1           | 0       |
| 1   | 186.0             | 39.5           | 0       |
| 2   | 195.0             | 40.3           | 2       |
| 4   | 193.0             | 36.7           | 0       |
| 5   | 190.0             | 39.3           | 0       |
| ... | ...               | ...            | ...     |
| 338 | 214.0             | 47.2           | 1       |
| 340 | 215.0             | 46.8           | 1       |
| 341 | 222.0             | 50.4           | 1       |
| 342 | 212.0             | 45.2           | 1       |
| 343 | 213.0             | 49.9           | 1       |

342 rows × 3 columns
kmeans.cluster_centers_
array([[186.99166667,  38.4275    ],
       [216.88372093,  47.56744186],
       [196.7311828 ,  45.95483871]])
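Each row of `cluster_centers_` is the (flipper length, bill length) centroid of one cluster. `predict` assigns new points to the nearest centroid; a small sketch with a made-up measurement:

```python
# Hedged sketch: assign a hypothetical new penguin to its nearest cluster center.
new_penguin = np.array([[200.0, 45.0]])   # flipper_length_mm, bill_length_mm (made up)
print(kmeans.predict(new_penguin))        # index of the closest centroid
```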
fig = plt.figure()
plt.subplot(1,2,1)
sns.scatterplot(data=Xdf, x="flipper_length_mm", y="bill_length_mm", hue="cluster")
plt.scatter(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1], c="red", marker="x")
plt.subplot(1,2,2)
sns.scatterplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", hue="species")
plt.show()
An example that ties together clustering and PCA: plot_kmeans_digits.ipynb
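A condensed, hedged sketch of the same idea, reducing the digits data to 2D with PCA and then clustering with k-means; the referenced notebook works through this more carefully, and its details may differ:

```python
# Hedged sketch combining PCA and k-means on the digits data, in the spirit
# of the referenced plot_kmeans_digits example.
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

digits = load_digits()
X2 = PCA(n_components=2).fit_transform(digits.data)   # 64-D images -> 2-D points
labels = KMeans(n_clusters=10, n_init=10, random_state=0).fit_predict(X2)

plt.scatter(X2[:, 0], X2[:, 1], c=labels, s=8, cmap="tab10")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("k-means clusters in PCA space (digits)")
plt.show()
```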