import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

# Seed passed as random_state to both train_test_split calls below so the
# train/validation/test partition is reproducible from run to run.
RANDOM_SEED = 42

# Load the dry bean table: 16 numeric measurement columns plus a string
# "Class" column holding the bean variety.
beans = pd.read_csv('/cluster/academic/DATA311/202620/drybean.csv')

# Notebook display of the loaded frame.
beans

# separate features from labels
features = beans.drop(columns="Class")
labels = beans["Class"]

# Class distribution over the full dataset; the classes are not balanced,
# which is why the splits below are stratified.
labels.value_counts()

Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64

from sklearn.model_selection import train_test_split

# Held-out fractions. TEST_FRAC is taken from the full dataset; VAL_FRAC is
# taken from what is left after the test split, so the validation set ends up
# being VAL_FRAC * (1 - TEST_FRAC) = 16% of all rows (and training 64%).
TEST_FRAC = 0.2
VAL_FRAC = 0.2

# Step 1: carve off the test set. Stratifying on the labels keeps the class
# proportions the same in every split (stratified splitting needs shuffling).
features_rest, features_test, labels_rest, labels_test = train_test_split(
    features,
    labels,
    test_size=TEST_FRAC,
    shuffle=True,
    stratify=labels,
    random_state=RANDOM_SEED,
)

# Step 2: divide the remainder into training and validation sets the same way.
features_train, features_val, labels_train, labels_val = train_test_split(
    features_rest,
    labels_rest,
    test_size=VAL_FRAC,
    shuffle=True,
    stratify=labels_rest,
    random_state=RANDOM_SEED,
)

# Class distribution of the training split.
labels_train.value_counts()

Class
DERMASON    2269
SIRA        1687
SEKER       1297
HOROZ       1234
CALI        1043
BARBUNYA     846
BOMBAY       334
Name: count, dtype: int64

# Majority-class baseline: the accuracy obtained by always predicting the most
# frequent training class (value_counts() lists the largest count first).
labels_train.value_counts().iloc[0] / len(labels_train)

np.float64(0.26050516647531574)

# Import the preprocessing classes explicitly rather than reaching through
# `sklearn.preprocessing`: `import sklearn` alone is not a documented way to
# make that submodule available, and the other estimators in this notebook
# are imported by name as well.
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Naively take features as they are:
# NOTE: these unscaled arrays are immediately replaced by the z-scored
# versions below; they only show what the "no preprocessing" option looks like.
X_train = features_train.to_numpy()
X_val   = features_val.to_numpy()
X_test  = features_test.to_numpy()

# Convert numerical features to z-scores:
# The scaler is fit on the training split only, so the mean and standard
# deviation applied to validation/test data come from training data alone.
scaler = StandardScaler().fit(features_train)
X_train = scaler.transform(features_train)
X_val   = scaler.transform(features_val)
X_test  = scaler.transform(features_test)

# Convert categorical labels to ordinal integers
# (the class-to-integer mapping is learned from the training labels and then
# reused unchanged for the validation and test labels).
encoder = LabelEncoder().fit(labels_train)
y_train = encoder.transform(labels_train)
y_val   = encoder.transform(labels_val)
y_test  = encoder.transform(labels_test)

from sklearn.neighbors import KNeighborsClassifier

# Hyperparameters

# Number of neighbors consulted for each prediction.
K = 15

# Distance metric; simple metric choices: 'l1', 'l2'
METRIC = 'l2'

# Construct the k-nearest-neighbors classifier and fit it on the scaled
# training features and integer-encoded labels. fit() returns the estimator
# itself, so construction and fitting can be written as a single expression.
knn = KNeighborsClassifier(metric=METRIC, n_neighbors=K).fit(X_train, y_train)

def evaluate(classifier, X_train, y_train, X_val, y_val):
    """Print the classification accuracy of *classifier* on two data splits.

    Args:
        classifier: A fitted model exposing ``predict(X)`` that returns one
            predicted label per row of ``X``.
        X_train: Feature matrix of the training split.
        y_train: True labels for ``X_train`` (NumPy array).
        X_val: Feature matrix of the validation split.
        y_val: True labels for ``X_val`` (NumPy array).

    Returns:
        None. The two accuracies are printed, each as a fraction and as a
        percentage.
    """
    # Predict with the classifier that was passed in, not a module-level
    # model, so the function reports on whichever model the caller supplies.
    train_predictions = classifier.predict(X_train)
    val_predictions = classifier.predict(X_val)

    # Accuracy = number of correct predictions / number of examples.
    train_acc = (train_predictions == y_train).sum() / y_train.shape[0]
    val_acc = (val_predictions == y_val).sum() / y_val.shape[0]

    print(f"Training accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
    print(f"Validation accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")

# Report training and validation accuracy for the fitted k-NN model.
evaluate(knn, X_train, y_train, X_val, y_val)

Training accuracy: 0.9323 (93.23%)
Validation accuracy: 0.9311 (93.11%)

# Shape of the training matrix: (number of training rows, number of features).
X_train.shape

(8710, 16)

	Area	Perimeter	MajorAxisLength	MinorAxisLength	AspectRatio	Eccentricity	ConvexArea	EquivDiameter	Extent	Solidity	Roundness	Compactness	ShapeFactor1	ShapeFactor2	ShapeFactor3	ShapeFactor4	Class
0	28395	610.291	208.178117	173.888747	1.197191	0.549812	28715	190.141097	0.763923	0.988856	0.958027	0.913358	0.007332	0.003147	0.834222	0.998724	SEKER
1	28734	638.018	200.524796	182.734419	1.097356	0.411785	29172	191.272751	0.783968	0.984986	0.887034	0.953861	0.006979	0.003564	0.909851	0.998430	SEKER
2	29380	624.110	212.826130	175.931143	1.209713	0.562727	29690	193.410904	0.778113	0.989559	0.947849	0.908774	0.007244	0.003048	0.825871	0.999066	SEKER
3	30008	645.884	210.557999	182.516516	1.153638	0.498616	30724	195.467062	0.782681	0.976696	0.903936	0.928329	0.007017	0.003215	0.861794	0.994199	SEKER
4	30140	620.134	201.847882	190.279279	1.060798	0.333680	30417	195.896503	0.773098	0.990893	0.984877	0.970516	0.006697	0.003665	0.941900	0.999166	SEKER
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
13606	42097	759.696	288.721612	185.944705	1.552728	0.765002	42508	231.515799	0.714574	0.990331	0.916603	0.801865	0.006858	0.001749	0.642988	0.998385	DERMASON
13607	42101	757.499	281.576392	190.713136	1.476439	0.735702	42494	231.526798	0.799943	0.990752	0.922015	0.822252	0.006688	0.001886	0.676099	0.998219	DERMASON
13608	42139	759.321	281.539928	191.187979	1.472582	0.734065	42569	231.631261	0.729932	0.989899	0.918424	0.822730	0.006681	0.001888	0.676884	0.996767	DERMASON
13609	42147	763.779	283.382636	190.275731	1.489326	0.741055	42667	231.653247	0.705389	0.987813	0.907906	0.817457	0.006724	0.001852	0.668237	0.995222	DERMASON
13610	42159	772.237	295.142741	182.204716	1.619841	0.786693	42600	231.686223	0.788962	0.989648	0.888380	0.784997	0.007001	0.001640	0.616221	0.998180	DERMASON

Lecture 21 - Multiclass Classification Example, end-to-end

Goals:

1. Load and Split the Data

2. Baselines

3. Feature Extraction

4. Learn the Machine!

5. Evaluate Performance

Where could we go from here?