import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
def scale_split(ds):
    """Apply standard scaling and split into train/validation/test sets."""
    X, y = ds
    # Hold out 20% for test, then 20% of the remainder for validation.
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
    Xtr, Xva, ytr, yva = train_test_split(Xtr, ytr, test_size=0.2, random_state=42)
    # Fit the scaler on the training set only, then apply it to val/test.
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(Xtr)
    Xva = scaler.transform(Xva)
    Xte = scaler.transform(Xte)
    return (Xtr, Xva, Xte, ytr, yva, yte)
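Applying the 80/20 split twice leaves roughly 64% / 16% / 20% of the rows for train / validation / test. A minimal sanity check on fake data (the toy arrays here are illustrative, not part of the dataset):
# Toy check of the split proportions produced by scale_split (expect 640 / 160 / 200).
Xtoy = np.random.rand(1000, 3)
ytoy = np.random.randint(0, 2, size=1000)
sizes = [len(part) for part in scale_split((Xtoy, ytoy))[:3]]
print(sizes)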
forest = sklearn.datasets.fetch_covtype(as_frame=True)
forest["frame"]
  | Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | ... | Soil_Type_31 | Soil_Type_32 | Soil_Type_33 | Soil_Type_34 | Soil_Type_35 | Soil_Type_36 | Soil_Type_37 | Soil_Type_38 | Soil_Type_39 | Cover_Type |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2596.0 | 51.0 | 3.0 | 258.0 | 0.0 | 510.0 | 221.0 | 232.0 | 148.0 | 6279.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5 |
1 | 2590.0 | 56.0 | 2.0 | 212.0 | -6.0 | 390.0 | 220.0 | 235.0 | 151.0 | 6225.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5 |
2 | 2804.0 | 139.0 | 9.0 | 268.0 | 65.0 | 3180.0 | 234.0 | 238.0 | 135.0 | 6121.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2 |
3 | 2785.0 | 155.0 | 18.0 | 242.0 | 118.0 | 3090.0 | 238.0 | 238.0 | 122.0 | 6211.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2 |
4 | 2595.0 | 45.0 | 2.0 | 153.0 | -1.0 | 391.0 | 220.0 | 234.0 | 150.0 | 6172.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
581007 | 2396.0 | 153.0 | 20.0 | 85.0 | 17.0 | 108.0 | 240.0 | 237.0 | 118.0 | 837.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3 |
581008 | 2391.0 | 152.0 | 19.0 | 67.0 | 12.0 | 95.0 | 240.0 | 237.0 | 119.0 | 845.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3 |
581009 | 2386.0 | 159.0 | 17.0 | 60.0 | 7.0 | 90.0 | 236.0 | 241.0 | 130.0 | 854.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3 |
581010 | 2384.0 | 170.0 | 15.0 | 60.0 | 5.0 | 90.0 | 230.0 | 245.0 | 143.0 | 864.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3 |
581011 | 2383.0 | 165.0 | 13.0 | 60.0 | 4.0 | 67.0 | 231.0 | 244.0 | 141.0 | 875.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3 |
581012 rows × 55 columns
X = forest["data"].sample(n=3000, random_state=42)
y = forest["target"].sample(n=3000, random_state=42)
plt.imshow(X.corr().to_numpy())
[Heatmap of the feature correlation matrix]
(Xtrain, Xval, Xtest, ytrain, yval, ytest) = scale_split((X, y))
sns.histplot(ytrain)
[Histogram of Cover_Type counts in the training set]
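The exact counts behind the histogram; class 2 is the most frequent, which is what the mode baseline below exploits:
# Class counts in the training set, in label order.
print(np.unique(ytrain, return_counts=True))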
def run_eval(clf, Xtrain, ytrain, Xval, yval):
    """Compute train/val predictions and class scores for a fitted classifier."""
    def model_outputs(clf, X):
        # Prefer decision_function scores; fall back to predicted probabilities.
        if hasattr(clf, "decision_function"):
            return clf.decision_function(X)
        else:
            return clf.predict_proba(X)
    ytrain_pred = clf.predict(Xtrain)
    yval_pred = clf.predict(Xval)
    train_outputs = model_outputs(clf, Xtrain)
    val_outputs = model_outputs(clf, Xval)
    return (ytrain_pred, yval_pred, train_outputs, val_outputs)
def show_eval(eval_results, ytrain, yval):
    """Show the results of evaluating a model; eval_results should be
    (ytrain_pred, yval_pred, train_outputs, val_outputs) as constructed
    by run_eval."""
    (ytrain_pred, yval_pred, train_outputs, val_outputs) = eval_results
    # Confusion matrix on the validation set.
    confusion = confusion_matrix(yval, yval_pred)
    ConfusionMatrixDisplay(confusion).plot()
    # Accuracy and top-k accuracy on the train and validation sets.
    return pd.DataFrame([
        ["Accuracy", accuracy_score(ytrain, ytrain_pred), accuracy_score(yval, yval_pred)],
        ["Top-2 Acc", top_k_accuracy_score(ytrain, train_outputs, k=2),
         top_k_accuracy_score(yval, val_outputs, k=2)],
        ["Top-3 Acc", top_k_accuracy_score(ytrain, train_outputs, k=3),
         top_k_accuracy_score(yval, val_outputs, k=3)]]
    ).rename(columns={1: "Train", 2: "Val"})
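A minimal usage sketch of these two helpers with one of the classifiers imported above (Xtrain/ytrain/Xval/yval come from scale_split; this is illustrative, separate from the comparisons below):
# Fit a simple model and feed it through run_eval / show_eval.
nb = GaussianNB()
nb.fit(Xtrain, ytrain)
show_eval(run_eval(nb, Xtrain, ytrain, Xval, yval), ytrain, yval)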
# mode baseline:
# predictions: just always say 2 (the most common class in the training set)
ytrain_mode = 2*np.ones_like(ytrain)
yval_mode = 2*np.ones_like(yval)
# model scores: rank label 2 highest, then label 1, then label 3 (columns 1, 0, 2),
# so the baseline's top-2 and top-3 predictions are {2, 1} and {2, 1, 3}
train_outputs_mode = np.zeros((Xtrain.shape[0], 7))
train_outputs_mode[:, 1] = 1
train_outputs_mode[:, 0] = .5
train_outputs_mode[:, 2] = .2
val_outputs_mode = np.zeros((Xval.shape[0], 7))
val_outputs_mode[:, 1] = 1
val_outputs_mode[:, 0] = .5
val_outputs_mode[:, 2] = .2
show_eval((ytrain_mode, yval_mode, train_outputs_mode, val_outputs_mode), ytrain, yval)
  | 0 | Train | Val |
---|---|---|---|
0 | Accuracy | 0.482292 | 0.443750 |
1 | Top-2 Acc | 0.851562 | 0.833333 |
2 | Top-3 Acc | 0.917188 | 0.897917 |
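Because the baseline's scores rank label 2 first, then 1, then 3, its top-2 and top-3 accuracies are just the fractions of examples whose true label lies in {1, 2} and {1, 2, 3}. A quick check:
# These two fractions should match the baseline's Top-2 and Top-3 accuracies above.
print(np.isin(ytrain, [1, 2]).mean(), np.isin(ytrain, [1, 2, 3]).mean())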
for i in range(7):
    p = i - 5
    clf = SVC(kernel="linear", C=10**p)
    clf.fit(Xtrain, ytrain)
    eval_results = run_eval(clf, Xtrain, ytrain, Xval, yval)
    print(f"C = {10**p}")
    print(show_eval(eval_results, ytrain, yval))
C | Accuracy (Train) | Accuracy (Val) | Top-2 Acc (Train) | Top-2 Acc (Val) | Top-3 Acc (Train) | Top-3 Acc (Val) |
---|---|---|---|---|---|---|
1e-05 | 0.482292 | 0.443750 | 0.851562 | 0.833333 | 0.917188 | 0.897917 |
0.0001 | 0.482292 | 0.443750 | 0.851562 | 0.833333 | 0.917188 | 0.897917 |
0.001 | 0.656771 | 0.610417 | 0.924479 | 0.904167 | 0.969271 | 0.956250 |
0.01 | 0.714063 | 0.677083 | 0.959896 | 0.937500 | 0.990104 | 0.985417 |
0.1 | 0.743750 | 0.700000 | 0.964583 | 0.943750 | 0.993750 | 0.991667 |
1 | 0.749479 | 0.704167 | 0.976042 | 0.945833 | 0.998958 | 0.995833 |
10 | 0.755729 | 0.710417 | 0.981771 | 0.947917 | 0.999479 | 0.991667 |
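Rather than reading the best setting off the table above, the same sweep can pick C by validation accuracy programmatically; a minimal sketch using the variables already in scope:
# Pick the C with the highest validation accuracy from the same grid.
best_C, best_acc = None, 0.0
for p in range(-5, 2):
    clf = SVC(kernel="linear", C=10**p).fit(Xtrain, ytrain)
    acc = clf.score(Xval, yval)
    if acc > best_acc:
        best_C, best_acc = 10**p, acc
print(best_C, best_acc)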
classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Logistic Regression": LogisticRegression(),
    "Linear SVM": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Neural Network": MLPClassifier((20, 100), alpha=1, max_iter=1000),
    "Naive Bayes": GaussianNB(),
}
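A sketch of how this dictionary might be used, fitting each model on the scaled training set and comparing validation accuracy (the loop and variable names here are illustrative):
# Fit each classifier and collect its validation accuracy for a quick comparison.
val_scores = {}
for name, clf in classifiers.items():
    clf.fit(Xtrain, ytrain)
    val_scores[name] = clf.score(Xval, yval)
print(pd.Series(val_scores).sort_values(ascending=False))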