Announcements:
Goals:
- Be able to define supervised learning, classification and regression
- Understand and be able to implement a k-nearest-neighbors (KNN) classifier or regressor
- Know why and how to subdivide datasets into training, validation, and test sets
- Understand what hyperparameters are and how to tune them using a validation set
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
Today we'll talk more about supervised learning; as a reminder, this is where we have:
- (as before) a dataset $X$ with shape $(n, d)$ that has $n$ datapoints each represented by a $d$-dimensional feature vector.
numerical_features = [
'bill_length_mm',
# 'bill_depth_mm',
# 'flipper_length_mm',
'body_mass_g'
]
penguins = sns.load_dataset("penguins").dropna().sample(frac=1, random_state=42)
# standardize the numerical columns to produce X:
X = np.zeros_like(penguins[numerical_features].to_numpy())
for i, col in enumerate(numerical_features):
c = penguins[col]
X[:,i] = (c - c.mean()) / c.std()
print(X.shape)
- (new for supervised learning) $y$, a length-$n$ vector of labels representing some aspect we'd like to predict
In our Penguins example, we could use the species column as our $y$.
y = penguins["species"]
print(y.shape)
print(y.iloc[0])
This is a categorical column; the task of predicting its value is called classification because we are trying to classify the penguin as one of a discrete set of categories or labels.
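Here, the discrete set of labels is the three penguin species:
print(y.unique())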
We could also imagine predicting, say, flipper length from body mass and bill length:
y = penguins["flipper_length_mm"]
print(y.shape)
print(y.iloc[0])
In this case, we are predicting a (continuous) numerical quantity; this is called regression.
For now, we'll stick with the species classification problem:
y = penguins["species"]
We still have our trusty $L^p$ distance function available, and I've followed it with a "vectorized" version that calculates many distances in one go:
def L(p, a, b):
""" Compute the L^p distance between vectors a and b
Pre: p > 0 and a, b are d-dimensional 1d arrays """
return np.sum(np.abs(a - b) ** p) ** (1/p)
def L_vectorized(p, X, b):
""" Compute the L^p distance between each row of X (n, d)
and b (d,). Returns a vector of size (n,). """
n, d = X.shape
    return np.sum(np.abs(X - b.reshape((1, d))) ** p, axis=1) ** (1/p)
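As a quick sanity check, the two versions should agree; here we compare the distance from row 1 of X to row 0 computed both ways:
b = X[0, :]
print(L(2, X[1, :], b))             # loop version, one pair at a time
print(L_vectorized(2, X, b)[1])     # vectorized version, pick out entry 1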
We'll split the dataset into some "known" training data and "unseen" validation data to test on:
Xtrain = X[:300, :]
ytrain = y[:300]
Xval = X[300:, :]
yval = y[300:]
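A quick look at the split sizes:
print(Xtrain.shape, ytrain.shape)
print(Xval.shape, yval.shape)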
Exercise: implement and evaluate the accuracy of a 1-nearest-neighbor classifier by following the pseudocode below. You should find the L_vectorized function from above helpful.
correct_guesses = 0
for i in range(len(yval)):
pass
# calculate the distances between each row of Xtrain and Xval[i,:]
# find the label of the penguin with the smallest distance
    # check whether it matches the true label (yval.iloc[i]), add one to correct_guesses if so
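One possible solution, using the Euclidean (p = 2) distance; try it yourself before peeking:
correct_guesses = 0
for i in range(len(yval)):
    # distances from every training point to the i-th validation point
    dists = L_vectorized(2, Xtrain, Xval[i, :])
    # the label of the closest training point is our guess
    guess = ytrain.iloc[np.argmin(dists)]
    # compare against the true label
    if guess == yval.iloc[i]:
        correct_guesses += 1
print("1-NN accuracy:", correct_guesses / len(yval))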
Next: generalize this to a K-nearest neighbors classifier by taking the most common label from among the K nearest neighbors.
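Here is a sketch of one way to do this (knn_predict is just an illustrative helper name, K = 5 is an arbitrary choice, and ties in the vote go to whichever label Counter encountered first):
from collections import Counter

def knn_predict(K, Xtrain, ytrain, x):
    """ Predict the label of x as the most common label
    among its K nearest training points. """
    dists = L_vectorized(2, Xtrain, x)
    nearest = np.argsort(dists)[:K]        # indices of the K smallest distances
    votes = Counter(ytrain.iloc[nearest])  # count labels among those neighbors
    return votes.most_common(1)[0][0]

correct_guesses = 0
for i in range(len(yval)):
    if knn_predict(5, Xtrain, ytrain, Xval[i, :]) == yval.iloc[i]:
        correct_guesses += 1
print("5-NN accuracy:", correct_guesses / len(yval))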