import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Genre-labelled audio-feature table hosted on the course web site.
url = "https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_21f/data/genres_v2.csv"
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single consistently inferred dtype.
df = pd.read_csv(url, low_memory=False)


# Summarize column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42305 entries, 0 to 42304
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      42305 non-null  float64
 1   energy            42305 non-null  float64
 2   key               42305 non-null  int64  
 3   loudness          42305 non-null  float64
 4   mode              42305 non-null  int64  
 5   speechiness       42305 non-null  float64
 6   acousticness      42305 non-null  float64
 7   instrumentalness  42305 non-null  float64
 8   liveness          42305 non-null  float64
 9   valence           42305 non-null  float64
 10  tempo             42305 non-null  float64
 11  type              42305 non-null  object 
 12  id                42305 non-null  object 
 13  uri               42305 non-null  object 
 14  track_href        42305 non-null  object 
 15  analysis_url      42305 non-null  object 
 16  duration_ms       42305 non-null  int64  
 17  time_signature    42305 non-null  int64  
 18  genre             42305 non-null  object 
 19  song_name         21519 non-null  object 
 20  Unnamed: 0        20780 non-null  float64
 21  title             20780 non-null  object 
dtypes: float64(10), int64(4), object(8)
memory usage: 7.1+ MB


# Keep the first 11 columns (danceability ... tempo), which df.info() lists
# as the numeric audio features.
df_num = df.iloc[:, :11]
df_num.head()


# Check how the rows are distributed over the values of "mode".
df_num["mode"].value_counts()

1    23245
0    19060
Name: mode, dtype: int64


# Feature matrix for PCA: every numeric column except "mode".
X = df_num.drop(["mode"], axis=1).to_numpy()


from sklearn.decomposition import PCA

# Fit PCA on the raw (unscaled) features, keeping as many components as there
# are input columns so the whole variance spectrum is available.
pca = PCA(n_components = X.shape[1])
pca.fit(X)
print(pca.explained_variance_ratio_)
# Cumulative explained variance against component index.
sns.lineplot(x=range(X.shape[1]), y=np.cumsum(pca.explained_variance_ratio_))

[9.62424267e-01 2.27429065e-02 1.43298842e-02 2.47690314e-04
 9.10895363e-05 5.53446717e-05 4.07700560e-05 3.17857446e-05
 2.10844010e-05 1.51778597e-05]

<AxesSubplot:>


# Project every row onto the principal axes. Despite its name, X2d holds all
# components; only the first two are stored on df for plotting.
X2d = pca.transform(X)
df["c1"] = X2d[:,0]
df["c2"] = X2d[:,1]
# Scatter of the first two components, coloured by genre.
sns.relplot(x="c1", y="c2", hue="genre", data=df)

<seaborn.axisgrid.FacetGrid at 0x168b440d0>


# Same projection, drawn as one row of the facet grid per genre.
sns.relplot(x="c1", y="c2", row="genre", data=df)

<seaborn.axisgrid.FacetGrid at 0x1688858e0>


# Weights of the first two principal components, tabulated next to the
# feature names they apply to (columns: feature name, c1 weight, c2 weight).
c1 = pca.components_[0,:]
c2 = pca.components_[1,:]
pd.DataFrame([df_num.drop("mode", axis=1).columns, c1, c2]).T


# Number of distinct values in each numeric column.
df_num.nunique()

danceability          890
energy                917
key                    12
loudness            11654
mode                    2
speechiness          1447
acousticness         4602
instrumentalness     4757
liveness             1695
valence              1674
tempo               15606
dtype: int64


# Colour the projection by tempo to see how tempo relates to the components.
sns.relplot(x="c1", y="c2", hue="tempo", data=df)

<seaborn.axisgrid.FacetGrid at 0x168b44910>


# Box plots of the unscaled numeric columns, all drawn on one shared axis.
fig = plt.figure(figsize=(10, 5))
sns.boxplot(data=df_num)
# Slant the tick labels so the long feature names do not overlap.
plt.xticks(rotation=30, ha="right")

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 [Text(0, 0, 'danceability'),
  Text(1, 0, 'energy'),
  Text(2, 0, 'key'),
  Text(3, 0, 'loudness'),
  Text(4, 0, 'mode'),
  Text(5, 0, 'speechiness'),
  Text(6, 0, 'acousticness'),
  Text(7, 0, 'instrumentalness'),
  Text(8, 0, 'liveness'),
  Text(9, 0, 'valence'),
  Text(10, 0, 'tempo')])


from sklearn.preprocessing import StandardScaler

# Rebuild the feature matrix: every numeric column except "mode".
X = df_num.drop(["mode"], axis=1).to_numpy()

# Standardize each column before PCA.
scaler = StandardScaler(copy=True).fit(X)
Xscaled = scaler.transform(X)

# Refit PCA on the standardized features, keeping every component.
pca = PCA(n_components = Xscaled.shape[1])
pca.fit(Xscaled)
# A bare expression in the middle of a cell is discarded, so print the ratios
# explicitly, as the unscaled PCA cell does.
print(pca.explained_variance_ratio_)
# Cumulative explained variance against component index.
sns.lineplot(x=range(Xscaled.shape[1]), y=np.cumsum(pca.explained_variance_ratio_))

<AxesSubplot:>


# Box plots of the standardized features. Xscaled is a bare NumPy array, so
# the x ticks show positional column indices rather than feature names.
fig = plt.figure(figsize=(10, 5))
sns.boxplot(data=Xscaled)
plt.xticks(rotation=30, ha="right")

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 [Text(0, 0, '0'),
  Text(1, 0, '1'),
  Text(2, 0, '2'),
  Text(3, 0, '3'),
  Text(4, 0, '4'),
  Text(5, 0, '5'),
  Text(6, 0, '6'),
  Text(7, 0, '7'),
  Text(8, 0, '8'),
  Text(9, 0, '9')])


# Project the standardized rows onto the refitted principal axes and
# overwrite c1/c2 on df with the first two components.
X2d = pca.transform(Xscaled)
df["c1"] = X2d[:,0]
df["c2"] = X2d[:,1]
# Scatter of the first two components, coloured by genre.
sns.relplot(x="c1", y="c2", hue="genre", data=df)

<seaborn.axisgrid.FacetGrid at 0x166319400>


# Weights of the first two components of the PCA fitted on standardized
# data, tabulated next to the feature names they apply to.
c1 = pca.components_[0,:]
c2 = pca.components_[1,:]
pd.DataFrame([df_num.drop("mode", axis=1).columns, c1, c2]).T


from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB


# Load seaborn's "penguins" example dataset and drop rows with missing values.
penguins = sns.load_dataset("penguins").dropna()
penguins


# Bill length against body mass, coloured by species.
sns.scatterplot(data=penguins, x="bill_length_mm", y="body_mass_g", hue="species")

<AxesSubplot:xlabel='bill_length_mm', ylabel='body_mass_g'>


# Drop the Chinstrap rows; the label mapping below covers Adelie and Gentoo.
p = penguins[penguins["species"] != "Chinstrap"]


# Two features only, so the decision surface can be drawn in the plane.
features = ["bill_length_mm", "body_mass_g"]
X = p[features].to_numpy()
# Encode the species names as 0/1 labels.
y = p["species"].map({"Adelie": 0, "Gentoo": 1})


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from matplotlib.colors import ListedColormap

def scale_split(ds):
    """Standardize the features, then hold out a validation split.

    ds is an (X, y) pair. The scaler is fitted on all of X before the
    split, so the held-out rows also contribute to the scaling statistics.
    20% of the rows are held out, with a fixed random_state so the split
    is reproducible.

    Returns (Xtr, Xva, ytr, yva, xx, yy). xx and yy are meshgrid
    coordinates spanning the first two scaled feature columns, padded by
    0.5 on each side and sampled every 0.02, for drawing decision surfaces.
    """
    features, labels = ds
    scaled = StandardScaler().fit_transform(features)
    Xtr, Xva, ytr, yva = train_test_split(scaled, labels, test_size=0.2, random_state=42)

    # Dense grid over the (padded) range of the two plotted features.
    pad, step = 0.5, 0.02
    first, second = scaled[:, 0], scaled[:, 1]
    x_axis = np.arange(first.min() - pad, first.max() + pad, step)
    y_axis = np.arange(second.min() - pad, second.max() + pad, step)
    xx, yy = np.meshgrid(x_axis, y_axis)

    return (Xtr, Xva, ytr, yva, xx, yy)


def plot_clf_results(ds, Z, accuracy, ax):
    """Draw a classifier's score surface and the data points on axes ax.

    ds       -- the (Xtr, Xva, ytr, yva, xx, yy) tuple; xx/yy are the grid
                coordinates the scores in Z were evaluated on.
    Z        -- flat array of scores, one per grid point; it is reshaped
                to xx.shape here.
    accuracy -- number written (two decimals) near the lower-right corner.
    ax       -- Matplotlib axes to draw into.
    """
    Xtr, Xva, ytr, yva, xx, yy = ds
    x_lo, x_hi = xx.min(), xx.max()
    y_lo, y_hi = yy.min(), yy.max()

    point_colors = ListedColormap(["#FF0000", "#0000FF"])

    # Background: the score surface. The rows are reversed so that, with
    # imshow's default top-down row order, the lowest y ends up at the bottom.
    surface = Z.reshape(xx.shape)[::-1]
    ax.imshow(surface, cmap=plt.cm.RdBu, alpha=0.8, extent=[x_lo, x_hi, y_lo, y_hi])

    # Training points: opaque with black outlines.
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=ytr, cmap=point_colors, edgecolors="k")
    # Held-out points: translucent with white outlines.
    ax.scatter(Xva[:, 0], Xva[:, 1], c=yva, cmap=point_colors, edgecolors="w", alpha=0.4)

    # Clamp the view to the grid and hide the ticks.
    ax.set_xlim(x_lo, x_hi)
    ax.set_ylim(y_lo, y_hi)
    ax.set_xticks(())
    ax.set_yticks(())

    ax.text(x_hi - 0.3, y_lo + 0.3, f"{accuracy:.2f}", size=15, horizontalalignment="right")


def classify_dataset(clf, ds):
    """Cross-validate clf on the training split and plot each fold's model.

    clf -- an unfitted classifier accepted by sklearn's cross_validate; the
           fitted copies must offer decision_function or predict_proba.
    ds  -- the (Xtr, Xva, ytr, yva, xx, yy) tuple produced by scale_split.

    Runs 3-fold cross-validation on (Xtr, ytr), then evaluates each fold's
    fitted estimator on the (xx, yy) grid and draws one panel per fold.
    Every panel is annotated with the mean cross-validation accuracy.
    Shows the figure and returns None.
    """
    n_folds = 3  # drives both the CV split and the number of panels
    plt.figure(figsize=(12, 5))
    Xtr, _, ytr, _, xx, yy = ds

    cv_results = cross_validate(clf, Xtr, ytr, cv=n_folds, return_estimator=True)
    accuracy = cv_results["test_score"].mean()

    # The grid is identical for every fold, so flatten it only once.
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    for i, fitted_clf in enumerate(cv_results["estimator"]):
        # Score every grid point so the decision boundary can be visualized.
        # Inspect the fitted estimator itself, since that is the object called.
        if hasattr(fitted_clf, "decision_function"):
            # some classifiers give a "decision function" that outputs a score
            Z = fitted_clf.decision_function(grid_points)
        else:
            # others output a probability of each class
            Z = fitted_clf.predict_proba(grid_points)[:, 1]

        ax = plt.subplot(1, n_folds, i + 1)
        plot_clf_results(ds, Z, accuracy, ax)

    plt.tight_layout()
    plt.show()


# Scale, split and build the plotting grid for the two-feature penguin data.
Xtr, Xte, ytr, yte, xx, yy = scale_split((X, y))


# 3-fold cross-validation of a 3-nearest-neighbour classifier on the training
# split; return_estimator=True keeps the model fitted on each fold.
cv_results = cross_validate(KNeighborsClassifier(3), Xtr, ytr, cv=3, return_estimator=True)
cv_results

{'fit_time': array([0.00140095, 0.00096202, 0.00089908]),
 'score_time': array([0.0041461 , 0.00303006, 0.00434685]),
 'estimator': [KNeighborsClassifier(n_neighbors=3),
  KNeighborsClassifier(n_neighbors=3),
  KNeighborsClassifier(n_neighbors=3)],
 'test_score': array([0.94366197, 0.94366197, 0.95714286])}


# Cross-validate a linear SVM and draw one decision-surface panel per fold.
classify_dataset(SVC(kernel="linear", C=1), (Xtr, Xte, ytr, yte, xx, yy))


# The zoo, for reference - try out different classifiers
# in the cell above!
# Maps a display name to an unfitted estimator instance.
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Logistic Regression": LogisticRegression(),
    "Linear SVM": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Neural Network": MLPClassifier((20, 100), alpha=1, max_iter=1000),
    "Naive Bayes": GaussianNB(),
}

	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo
0	0.831	0.814	2	-7.364	1	0.4200	0.0598	0.013400	0.0556	0.3890	156.985
1	0.719	0.493	8	-7.230	1	0.0794	0.4010	0.000000	0.1180	0.1240	115.080
2	0.850	0.893	5	-4.783	1	0.0623	0.0138	0.000004	0.3720	0.0391	218.050
3	0.476	0.781	0	-4.710	1	0.1030	0.0237	0.000000	0.1140	0.1750	186.948
4	0.798	0.624	2	-7.668	1	0.2930	0.2170	0.000000	0.1660	0.5910	147.988

	0	1	2
0	danceability	0.001093	-0.000543
1	energy	0.000178	0.001967
2	key	0.001555	0.999904
3	loudness	-0.01898	-0.011473
4	speechiness	-0.000872	-0.001013
5	acousticness	-0.000394	-0.000089
6	instrumentalness	0.003235	0.006906
7	liveness	-0.000211	0.000075
8	valence	-0.000573	0.00181
9	tempo	-0.999812	0.001795

	0	1	2
0	danceability	0.356769	0.05098
1	energy	-0.548753	-0.189263
2	key	-0.039263	0.045507
3	loudness	-0.355553	-0.486001
4	speechiness	0.268682	-0.430117
5	acousticness	0.427551	0.013116
6	instrumentalness	-0.302621	0.486642
7	liveness	-0.216143	-0.229131
8	valence	0.222488	-0.335637
9	tempo	0.028952	-0.369009

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male
...	...	...	...	...	...	...	...
338	Gentoo	Biscoe	47.2	13.7	214.0	4925.0	Female
340	Gentoo	Biscoe	46.8	14.3	215.0	4850.0	Female
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

Lecture 35 - Preprocessing and Cross-Validation

Announcements:

Goals

Act I: A Tale of Scale

Act II: Cross-Validation, Hyperparameters, and how to tune them