import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
This dataset is from Spotify; it has some of their (presumably machine-learning-derived) attributes ("energy", "liveness", etc.), as well as a genre label.
I wanted to do PCA on the attributes, visualize it in 2D, and see if genres were well separated.
# Load the genre-labelled Spotify track table straight from the course URL.
url = "https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_21f/data/genres_v2.csv"
# low_memory=False: parse the file in one pass instead of in chunks, which
# avoids per-chunk dtype guessing on columns with mixed content.
df = pd.read_csv(url, low_memory=False)
# Print the column names, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 42305 entries, 0 to 42304 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 danceability 42305 non-null float64 1 energy 42305 non-null float64 2 key 42305 non-null int64 3 loudness 42305 non-null float64 4 mode 42305 non-null int64 5 speechiness 42305 non-null float64 6 acousticness 42305 non-null float64 7 instrumentalness 42305 non-null float64 8 liveness 42305 non-null float64 9 valence 42305 non-null float64 10 tempo 42305 non-null float64 11 type 42305 non-null object 12 id 42305 non-null object 13 uri 42305 non-null object 14 track_href 42305 non-null object 15 analysis_url 42305 non-null object 16 duration_ms 42305 non-null int64 17 time_signature 42305 non-null int64 18 genre 42305 non-null object 19 song_name 21519 non-null object 20 Unnamed: 0 20780 non-null float64 21 title 20780 non-null object dtypes: float64(10), int64(4), object(8) memory usage: 7.1+ MB
Grab the numerical attribute columns only:
# Positional slice: the first 11 columns (danceability through tempo in the
# df.info() listing) are the numeric audio attributes.
df_num = df.iloc[:, :11]
df_num.head()
danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.831 | 0.814 | 2 | -7.364 | 1 | 0.4200 | 0.0598 | 0.013400 | 0.0556 | 0.3890 | 156.985 |
1 | 0.719 | 0.493 | 8 | -7.230 | 1 | 0.0794 | 0.4010 | 0.000000 | 0.1180 | 0.1240 | 115.080 |
2 | 0.850 | 0.893 | 5 | -4.783 | 1 | 0.0623 | 0.0138 | 0.000004 | 0.3720 | 0.0391 | 218.050 |
3 | 0.476 | 0.781 | 0 | -4.710 | 1 | 0.1030 | 0.0237 | 0.000000 | 0.1140 | 0.1750 | 186.948 |
4 | 0.798 | 0.624 | 2 | -7.668 | 1 | 0.2930 | 0.2170 | 0.000000 | 0.1660 | 0.5910 | 147.988 |
# Check how many distinct values "mode" takes before treating it as numeric.
df_num["mode"].value_counts()
1 23245 0 19060 Name: mode, dtype: int64
I'm not sure what "mode" is, but it only has two values. Let's use only variables that seem truly numerical.
# Drop the two-valued "mode" column; the remaining ten columns become the
# feature matrix (rows = tracks, columns = attributes).
X = df_num.drop(["mode"], axis=1).to_numpy()
Ok, let's do some PCAing!
from sklearn.decomposition import PCA
# Keep every component (one per input column) so the whole variance
# spectrum can be inspected.
pca = PCA(n_components = X.shape[1])
# NOTE: X is passed in as-is here, with no per-column scaling applied.
pca.fit(X)
print(pca.explained_variance_ratio_)
# Cumulative fraction of variance explained as components are added.
sns.lineplot(x=range(X.shape[1]), y=np.cumsum(pca.explained_variance_ratio_))
[9.62424267e-01 2.27429065e-02 1.43298842e-02 2.47690314e-04 9.10895363e-05 5.53446717e-05 4.07700560e-05 3.17857446e-05 2.10844010e-05 1.51778597e-05]
<AxesSubplot:>
Wow, this is great! Practically all the variance is explained by 2 components. This means we won't lose anything if we plot the first 2 components.
# Project every track onto the principal axes. Despite the name, X2d has one
# column per fitted component; only the first two are used below.
X2d = pca.transform(X)
# Store the first two coordinates on df so seaborn can plot them with genre.
df["c1"] = X2d[:,0]
df["c2"] = X2d[:,1]
# Scatter of the first two components, coloured by genre.
sns.relplot(x="c1", y="c2", hue="genre", data=df)
<seaborn.axisgrid.FacetGrid at 0x168b440d0>
# Same scatter, but one facet row per genre so overlapping genres can be
# looked at separately.
sns.relplot(x="c1", y="c2", row="genre", data=df)
<seaborn.axisgrid.FacetGrid at 0x1688858e0>
Huh. That's a... weird picture.
The fact that there are exactly 12 lines is suspicious.
# Each row of pca.components_ is one principal axis; take the per-feature
# weights (loadings) of the first two.
c1 = pca.components_[0,:]
c2 = pca.components_[1,:]
# Tabulate each feature name next to its loading on the two components.
pd.DataFrame([df_num.drop("mode", axis=1).columns, c1, c2]).T
0 | 1 | 2 | |
---|---|---|---|
0 | danceability | 0.001093 | -0.000543 |
1 | energy | 0.000178 | 0.001967 |
2 | key | 0.001555 | 0.999904 |
3 | loudness | -0.01898 | -0.011473 |
4 | speechiness | -0.000872 | -0.001013 |
5 | acousticness | -0.000394 | -0.000089 |
6 | instrumentalness | 0.003235 | 0.006906 |
7 | liveness | -0.000211 | 0.000075 |
8 | valence | -0.000573 | 0.00181 |
9 | tempo | -0.999812 | 0.001795 |
# Count distinct values per column, to see which feature could produce a
# small fixed number of stripes in the scatter plot.
df_num.nunique()
danceability 890 energy 917 key 12 loudness 11654 mode 2 speechiness 1447 acousticness 4602 instrumentalness 4757 liveness 1695 valence 1674 tempo 15606 dtype: int64
# Colour the same projection by tempo to check that c1 tracks tempo, as its
# loading in the component table suggests.
sns.relplot(x="c1", y="c2", hue="tempo", data=df)
<seaborn.axisgrid.FacetGrid at 0x168b44910>
# Box plots of every raw column on one shared axis, to compare value ranges.
fig = plt.figure(figsize=(10, 5))
sns.boxplot(data=df_num)
# Slant the tick labels so the column names stay readable.
plt.xticks(rotation=30, ha="right")
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), [Text(0, 0, 'danceability'), Text(1, 0, 'energy'), Text(2, 0, 'key'), Text(3, 0, 'loudness'), Text(4, 0, 'mode'), Text(5, 0, 'speechiness'), Text(6, 0, 'acousticness'), Text(7, 0, 'instrumentalness'), Text(8, 0, 'liveness'), Text(9, 0, 'valence'), Text(10, 0, 'tempo')])
from sklearn.preprocessing import StandardScaler

# Rebuild the feature matrix (all numeric columns except "mode") and
# standardize each column to zero mean / unit variance before PCA.
X = df_num.drop(["mode"], axis=1).to_numpy()
scaler = StandardScaler(copy=True).fit(X)
Xscaled = scaler.transform(X)

# Keep every component so the full variance spectrum can be inspected.
pca = PCA(n_components=Xscaled.shape[1])
pca.fit(Xscaled)

# Print explicitly: a bare expression in the middle of a cell is evaluated
# and thrown away, so the ratios would otherwise never be displayed.
print(pca.explained_variance_ratio_)

# Cumulative fraction of variance explained as components are added.
sns.lineplot(
    x=range(Xscaled.shape[1]),
    y=np.cumsum(pca.explained_variance_ratio_),
)
<AxesSubplot:>
# Box plots of the standardized columns, for comparison with the raw ones.
fig = plt.figure(figsize=(10, 5))
# Xscaled is a bare array, so the x ticks are column indices, not names.
sns.boxplot(data=Xscaled)
plt.xticks(rotation=30, ha="right")
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), [Text(0, 0, '0'), Text(1, 0, '1'), Text(2, 0, '2'), Text(3, 0, '3'), Text(4, 0, '4'), Text(5, 0, '5'), Text(6, 0, '6'), Text(7, 0, '7'), Text(8, 0, '8'), Text(9, 0, '9')])
Less convenient, but also less surprising: the intrinsic dimensionality is about 10.
# Project the standardized data onto the refitted principal axes.
X2d = pca.transform(Xscaled)
# Overwrite the earlier c1/c2 columns with the first two new coordinates.
df["c1"] = X2d[:,0]
df["c2"] = X2d[:,1]
# Scatter of the first two components, coloured by genre.
sns.relplot(x="c1", y="c2", hue="genre", data=df)
<seaborn.axisgrid.FacetGrid at 0x166319400>
# Loadings of the first two principal axes of the PCA fitted on Xscaled.
c1 = pca.components_[0,:]
c2 = pca.components_[1,:]
# Tabulate each feature name next to its loading on the two components.
pd.DataFrame([df_num.drop("mode", axis=1).columns, c1, c2]).T
0 | 1 | 2 | |
---|---|---|---|
0 | danceability | 0.356769 | 0.05098 |
1 | energy | -0.548753 | -0.189263 |
2 | key | -0.039263 | 0.045507 |
3 | loudness | -0.355553 | -0.486001 |
4 | speechiness | 0.268682 | -0.430117 |
5 | acousticness | 0.427551 | 0.013116 |
6 | instrumentalness | -0.302621 | 0.486642 |
7 | liveness | -0.216143 | -0.229131 |
8 | valence | 0.222488 | -0.335637 |
9 | tempo | 0.028952 | -0.369009 |
Back to the zoo!
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
# Load the "penguins" dataset via seaborn and discard any row that has a
# missing value.
penguins = sns.load_dataset("penguins").dropna()
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
... | ... | ... | ... | ... | ... | ... | ... |
338 | Gentoo | Biscoe | 47.2 | 13.7 | 214.0 | 4925.0 | Female |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
333 rows × 7 columns
# Look at how the species separate on the two features used for
# classification below.
sns.scatterplot(data=penguins, x="bill_length_mm", y="body_mass_g", hue="species")
<AxesSubplot:xlabel='bill_length_mm', ylabel='body_mass_g'>
# Reduce to a two-class problem by removing the Chinstrap rows.
p = penguins[penguins["species"] != "Chinstrap"]
# Two features only, so the decision surface can be drawn in a 2D plot.
features = ["bill_length_mm", "body_mass_g"]
X = p[features].to_numpy()
# Encode the labels as integers: Adelie -> 0, Gentoo -> 1.
y = p["species"].map({"Adelie": 0, "Gentoo": 1})
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from matplotlib.colors import ListedColormap
def scale_split(ds, test_size=0.2, random_state=42):
    """Split a two-feature dataset into train/val sets and standard-scale it.

    The split is made first and the scaler is fitted on the training rows
    only; the validation rows are transformed with those training statistics.
    Fitting the scaler on all rows before splitting would let validation data
    influence the preprocessing (data leakage).

    Parameters
    ----------
    ds : tuple
        ``(X, y)`` where ``X`` is a 2D array with at least two columns and
        ``y`` holds one label per row.
    test_size : float, default 0.2
        Fraction of rows held out for validation.
    random_state : int, default 42
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        ``(Xtr, Xva, ytr, yva, xx, yy)``: the scaled train/validation
        features, their labels, and a dense ``np.meshgrid`` covering the
        scaled data (first two columns) with a 0.5 margin and 0.02 spacing.
    """
    X, y = ds
    Xtr, Xva, ytr, yva = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training rows only, then apply the same transform to both.
    scaler = StandardScaler().fit(Xtr)
    Xtr = scaler.transform(Xtr)
    Xva = scaler.transform(Xva)

    # Create a dense grid so we can show the decision boundary/classifier
    # scores; it has to span the train and validation points alike.
    scaled_all = np.vstack([Xtr, Xva])
    x_min, x_max = scaled_all[:, 0].min() - 0.5, scaled_all[:, 0].max() + 0.5
    y_min, y_max = scaled_all[:, 1].min() - 0.5, scaled_all[:, 1].max() + 0.5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, 0.02),
        np.arange(y_min, y_max, 0.02),
    )
    return (Xtr, Xva, ytr, yva, xx, yy)
def plot_clf_results(ds, Z, accuracy, ax):
    """Draw a classifier score map plus the data points into axes ``ax``.

    ``ds`` is the 6-tuple ``(Xtr, Xva, ytr, yva, xx, yy)``. ``Z`` holds one
    score per grid point and is reshaped to ``xx.shape`` here. ``accuracy``
    is written in the lower-right corner with two decimals.
    """
    train_pts, val_pts, train_labels, val_labels, xx, yy = ds
    left, right = xx.min(), xx.max()
    bottom, top = yy.min(), yy.max()

    score_cmap = plt.cm.RdBu
    point_cmap = ListedColormap(["#FF0000", "#0000FF"])

    # Background: the score surface. Rows are reversed because imshow draws
    # the first row at the top, while the grid's first row is the lowest y.
    score_map = Z.reshape(xx.shape)
    ax.imshow(
        score_map[::-1],
        cmap=score_cmap,
        alpha=0.8,
        extent=[left, right, bottom, top],
    )

    # Training points: opaque with black outlines.
    ax.scatter(
        train_pts[:, 0],
        train_pts[:, 1],
        c=train_labels,
        cmap=point_cmap,
        edgecolors="k",
    )
    # Held-out points: translucent with white outlines.
    ax.scatter(
        val_pts[:, 0],
        val_pts[:, 1],
        c=val_labels,
        cmap=point_cmap,
        edgecolors="w",
        alpha=0.4,
    )

    # Clamp the view to the grid and hide the ticks.
    ax.set_xlim(left, right)
    ax.set_ylim(bottom, top)
    ax.set_xticks(())
    ax.set_yticks(())

    ax.text(
        right - 0.3,
        bottom + 0.3,
        f"{accuracy:.2f}",
        size=15,
        horizontalalignment="right",
    )
def classify_dataset(clf, ds, cv=3):
    """Cross-validate ``clf`` on the training split and plot every fold.

    One panel is drawn per cross-validation fold. It shows the score surface
    of the estimator fitted on that fold, the train/held-out points from
    ``ds``, and that fold's own validation accuracy. Showing the mean on
    every panel would hide the fold-to-fold variation.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier; ``cross_validate`` fits a clone of
        it for each fold.
    ds : tuple
        ``(Xtr, Xva, ytr, yva, xx, yy)``. Only ``Xtr``/``ytr`` are used for
        cross-validation; the whole tuple is passed to ``plot_clf_results``.
    cv : int or splitter, default 3
        Forwarded to ``cross_validate``.
    """
    plt.figure(figsize=(12, 5))
    Xtr, _, ytr, _, xx, yy = ds

    cv_results = cross_validate(clf, Xtr, ytr, cv=cv, return_estimator=True)
    fold_estimators = cv_results["estimator"]
    fold_scores = cv_results["test_score"]

    # Every grid point as an (x, y) row; identical for all folds, so build
    # it once outside the loop.
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    for i, (fitted_clf, fold_accuracy) in enumerate(
        zip(fold_estimators, fold_scores)
    ):
        # Generate a decision score for every point in the 2D plot so the
        # decision boundary can be visualized.
        if hasattr(fitted_clf, "decision_function"):
            # some classifiers give a "decision function" that outputs a score
            Z = fitted_clf.decision_function(grid_points)
        else:
            # others output a probability of each class
            Z = fitted_clf.predict_proba(grid_points)[:, 1]

        ax = plt.subplot(1, len(fold_estimators), i + 1)
        plot_clf_results(ds, Z, fold_accuracy, ax)

    plt.tight_layout()
    plt.show()
# Scale and split the penguin data; the held-out part is named Xte/yte here.
Xtr, Xte, ytr, yte, xx, yy = scale_split((X, y))
# 3-fold cross-validation on the training part only; return_estimator=True
# keeps the fitted model from each fold in the result dict.
cv_results = cross_validate(KNeighborsClassifier(3), Xtr, ytr, cv=3, return_estimator=True)
cv_results
{'fit_time': array([0.00140095, 0.00096202, 0.00089908]), 'score_time': array([0.0041461 , 0.00303006, 0.00434685]), 'estimator': [KNeighborsClassifier(n_neighbors=3), KNeighborsClassifier(n_neighbors=3), KNeighborsClassifier(n_neighbors=3)], 'test_score': array([0.94366197, 0.94366197, 0.95714286])}
# Cross-validate a linear SVM and draw one decision-boundary panel per fold.
classify_dataset(SVC(kernel="linear", C=1), (Xtr, Xte, ytr, yte, xx, yy))
The above plot shows the classification results for each of the 3 different "folds" of training. Notice that the accuracy is similar but the decision boundaries are slightly different because of the particulars of which points were in the training vs validation set.
# The zoo, for reference - try out different classifiers
# in the cell above!
# Maps a display name to an unfitted estimator instance; any value can be
# passed as the first argument of classify_dataset.
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Logistic Regression": LogisticRegression(),
    "Linear SVM": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Neural Network": MLPClassifier((20, 100), alpha=1, max_iter=1000),
    "Naive Bayes": GaussianNB(),
}